2003-03-13 08:52:39

by Alex Tomas

[permalink] [raw]
Subject: [PATCH] concurrent block allocation for ext2 against 2.5.64


Hi!

as Andrew said, concurrent balloc for ext3 is useless because of BKL.
and I saw it in benchmarks. but it may be useful for ext2.

Results:
9/100000 9/500000 16/100000 16/500000 32/100000 32/500000
ext2: 0m9.260s 0m46.160s 0m18.133s 1m33.553s 0m35.958s 3m4.164s
ext2-ca: 0m8.578s 0m42.712s 0m17.412s 1m28.637s 0m33.736s 2m53.824s

in those benchmarks, I ran 2 processes; each of them writes N blocks
(9, 16, 32), truncates the file, and repeats these steps M times (100000, 500000).




diff -uNr linux/fs/ext2/balloc.c edited/fs/ext2/balloc.c
--- linux/fs/ext2/balloc.c Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/balloc.c Thu Mar 13 10:54:50 2003
@@ -98,9 +98,13 @@
{
struct ext2_sb_info * sbi = EXT2_SB(sb);
struct ext2_super_block * es = sbi->s_es;
- unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
- unsigned root_blocks = le32_to_cpu(es->s_r_blocks_count);
+ unsigned free_blocks;
+ unsigned root_blocks;

+ spin_lock(&sbi->s_alloc_lock);
+
+ free_blocks = le32_to_cpu(es->s_free_blocks_count);
+ root_blocks = le32_to_cpu(es->s_r_blocks_count);
if (free_blocks < count)
count = free_blocks;

@@ -113,11 +117,16 @@
*/
if (free_blocks > root_blocks)
count = free_blocks - root_blocks;
- else
+ else {
+ spin_unlock(&sbi->s_alloc_lock);
return 0;
+ }
}

es->s_free_blocks_count = cpu_to_le32(free_blocks - count);
+
+ spin_unlock(&sbi->s_alloc_lock);
+
mark_buffer_dirty(sbi->s_sbh);
sb->s_dirt = 1;
return count;
@@ -128,35 +137,54 @@
if (count) {
struct ext2_sb_info * sbi = EXT2_SB(sb);
struct ext2_super_block * es = sbi->s_es;
- unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
+ unsigned free_blocks;
+
+ spin_lock(&sbi->s_alloc_lock);
+ free_blocks = le32_to_cpu(es->s_free_blocks_count);
es->s_free_blocks_count = cpu_to_le32(free_blocks + count);
+ spin_unlock(&sbi->s_alloc_lock);
+
mark_buffer_dirty(sbi->s_sbh);
sb->s_dirt = 1;
}
}

-static inline int group_reserve_blocks(struct ext2_group_desc *desc,
+static inline int group_reserve_blocks(struct ext2_sb_info *sbi, struct ext2_group_desc *desc,
struct buffer_head *bh, int count)
{
unsigned free_blocks;

- if (!desc->bg_free_blocks_count)
+ spin_lock(&sbi->s_alloc_lock);
+
+ if (!desc->bg_free_blocks_count) {
+ spin_unlock(&sbi->s_alloc_lock);
return 0;
+ }

free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
if (free_blocks < count)
count = free_blocks;
desc->bg_free_blocks_count = cpu_to_le16(free_blocks - count);
+
+ spin_unlock(&sbi->s_alloc_lock);
+
mark_buffer_dirty(bh);
return count;
}

-static inline void group_release_blocks(struct ext2_group_desc *desc,
+static inline void group_release_blocks(struct ext2_sb_info *sbi, struct ext2_group_desc *desc,
struct buffer_head *bh, int count)
{
if (count) {
- unsigned free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
+ unsigned free_blocks;
+
+ spin_lock(&sbi->s_alloc_lock);
+
+ free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count);
+
+ spin_unlock(&sbi->s_alloc_lock);
+
mark_buffer_dirty(bh);
}
}
@@ -176,7 +204,6 @@
struct ext2_super_block * es;
unsigned freed = 0, group_freed;

- lock_super (sb);
es = EXT2_SB(sb)->s_es;
if (block < le32_to_cpu(es->s_first_data_block) ||
block + count < block ||
@@ -224,7 +251,7 @@
block, count);

for (i = 0, group_freed = 0; i < count; i++) {
- if (!ext2_clear_bit(bit + i, bitmap_bh->b_data))
+ if (!test_and_clear_bit(bit + i, (void *) bitmap_bh->b_data))
ext2_error (sb, "ext2_free_blocks",
"bit already cleared for block %lu",
block + i);
@@ -236,7 +263,7 @@
if (sb->s_flags & MS_SYNCHRONOUS)
sync_dirty_buffer(bitmap_bh);

- group_release_blocks(desc, bh2, group_freed);
+ group_release_blocks(EXT2_SB(sb), desc, bh2, group_freed);
freed += group_freed;

if (overflow) {
@@ -247,7 +274,6 @@
error_return:
brelse(bitmap_bh);
release_blocks(sb, freed);
- unlock_super (sb);
DQUOT_FREE_BLOCK(inode, freed);
}

@@ -258,6 +284,8 @@

if (!ext2_test_bit(goal, map))
goto got_it;
+
+repeat:
if (goal) {
/*
* The goal was occupied; search forward for a free
@@ -297,7 +325,8 @@
}
return -1;
got_it:
- ext2_set_bit(goal, map);
+ if (test_and_set_bit(goal, (void *) map))
+ goto repeat;
return goal;
}

@@ -342,8 +371,6 @@

dq_alloc = prealloc_goal + 1;

- lock_super (sb);
-
es_alloc = reserve_blocks(sb, dq_alloc);
if (!es_alloc) {
*err = -ENOSPC;
@@ -360,7 +387,7 @@
if (!desc)
goto io_error;

- group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+ group_alloc = group_reserve_blocks(sbi, desc, gdp_bh, es_alloc);
if (group_alloc) {
ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
group_size);
@@ -375,7 +402,7 @@
group_size, ret_block);
if (ret_block >= 0)
goto got_block;
- group_release_blocks(desc, gdp_bh, group_alloc);
+ group_release_blocks(sbi, desc, gdp_bh, group_alloc);
group_alloc = 0;
}

@@ -393,7 +420,7 @@
desc = ext2_get_group_desc(sb, group_no, &gdp_bh);
if (!desc)
goto io_error;
- group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+ group_alloc = group_reserve_blocks(sbi, desc, gdp_bh, es_alloc);
}
if (bit >= sbi->s_groups_count) {
*err = -ENOSPC;
@@ -452,7 +479,7 @@
unsigned n;

for (n = 0; n < group_alloc && ++ret_block < group_size; n++) {
- if (ext2_set_bit(ret_block, bitmap_bh->b_data))
+ if (test_and_set_bit(ret_block, (void *) bitmap_bh->b_data))
break;
}
*prealloc_block = block + 1;
@@ -471,10 +498,9 @@

*err = 0;
out_release:
- group_release_blocks(desc, gdp_bh, group_alloc);
+ group_release_blocks(sbi, desc, gdp_bh, group_alloc);
release_blocks(sb, es_alloc);
out_unlock:
- unlock_super (sb);
DQUOT_FREE_BLOCK(inode, dq_alloc);
out:
brelse(bitmap_bh);
diff -uNr linux/fs/ext2/super.c edited/fs/ext2/super.c
--- linux/fs/ext2/super.c Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/super.c Wed Mar 12 23:29:53 2003
@@ -564,6 +564,7 @@
return -ENOMEM;
sb->s_fs_info = sbi;
memset(sbi, 0, sizeof(*sbi));
+ spin_lock_init(&sbi->s_alloc_lock);

/*
* See what the current blocksize for the device is, and
diff -uNr linux/include/linux/ext2_fs_sb.h edited/include/linux/ext2_fs_sb.h
--- linux/include/linux/ext2_fs_sb.h Mon Nov 11 06:28:30 2002
+++ edited/include/linux/ext2_fs_sb.h Wed Mar 12 22:57:30 2003
@@ -45,6 +45,7 @@
u32 s_next_generation;
unsigned long s_dir_count;
u8 *s_debts;
+ spinlock_t s_alloc_lock;
};

#endif /* _LINUX_EXT2_FS_SB */


2003-03-13 09:46:52

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

Alex Tomas <[email protected]> wrote:
>
>
> Hi!
>
> as Andrew said, concurrent balloc for ext3 is useless because of BKL.
> and I saw it in benchmarks. but it may be useful for ext2.
>
> Results:
> 9/100000 9/500000 16/100000 16/500000 32/100000 32/500000
> ext2: 0m9.260s 0m46.160s 0m18.133s 1m33.553s 0m35.958s 3m4.164s
> ext2-ca: 0m8.578s 0m42.712s 0m17.412s 1m28.637s 0m33.736s 2m53.824s
>
> in those benchmarks, I run 2 process, each of them writes N blocks
> (9, 16, 32), truncates file and repeat these steps M times (100000, 500000).

OK. The main gain here is from the large context switch rate which
lock_super() can cause on big machines.

> - if (!ext2_clear_bit(bit + i, bitmap_bh->b_data))
> + if (!test_and_clear_bit(bit + i, (void *) bitmap_bh->b_data))

Nope.

This is an on-disk bitmap. ext2_clear_bit() is endian-neutral - see the
ppc/ppc64/mips/etc implementations. The code you have here will not work on
big-endian architectures.

We either need to create per-architecture atomic implementations of
ext2_foo_bit(), or use the existing ones under spinlock.

You could do:

int bzzz_set_bit(struct ext2_bg_info *bgi, int bit, void *addr)
{
#if __BIG_ENDIAN
int ret;

spin_lock(&bgi->s_alloc_lock);
ret = ext2_set_bit(bit, addr);
spin_unlock(&bgi->s_alloc_lock);
return ret;
#else
return test_and_set_bit(bit, addr);
#endif
}

I think that will work...

> @@ -45,6 +45,7 @@
> u32 s_next_generation;
> unsigned long s_dir_count;
> u8 *s_debts;
> + spinlock_t s_alloc_lock;
> };

You can do better than this. A spinlock per blockgroup will scale better,
and is pretty easy.

See that s_debts thing? That points to an array of bytes, one per
blockgroup. Turn it into:

struct ext2_bg_info {
u8 s_debt;
spinlock_t s_alloc_lock;
};

And the locking can become per-blockgroup.

The problem with this is the fs-wide s_free_blocks_count thing. It needs
global locking. But do we need it?

If you look, you'll see that's not really used for much. When we report the
free block count to userspace you can just locklesly zoom across all the
blockgroups adding them up. You'll have to do the same in
find_group_orlov(), which is a bit sucky, but that's only used by mkdir.

The only thing left which needs the global free blocks counter is the
"reserved blocks for root" thing, which doesn't work very well anyway. A way
to fix that would be to add a "reserved to root" field to ext2_bg_info, and
to precalculate these at mount time.

So the mount code walks across the blockgroups reserving blocks in each one
until it has reserved the required number of blocks. This way the for-root
reservation becomes per-block-group. It should only be dipped into if all
blockgroups are otherwise full.

Or something like that ;)


2003-03-13 17:29:29

by Andreas Dilger

[permalink] [raw]
Subject: Re: [Ext2-devel] [PATCH] concurrent block allocation for ext2 against 2.5.64

On Mar 13, 2003 11:55 +0300, Alex Tomas wrote:
> as Andrew said, concurrent balloc for ext3 is useless because of BKL.
> and I saw it in benchmarks. but it may be useful for ext2.

Sadly, we are constantly diverging the ext2/ext3 codebases. Lots of
features are going into ext3, but lots of fixes/improvements are only
going into ext2. Is ext3 holding BKL for doing journal_start() still?

Looking at ext3_prepare_write() we grab the BKL for doing journal_start()
and for journal_stop(), but I don't _think_ we need BKL for journal_stop()
do we? We may or may not need it for the journal_data case, but that is
not even working right now I think.

It also seems we are getting BKL in ext3_truncate(), which likely isn't
needed past journal_start(), although we do need to have superblock-only
lock for ext3_orphan_add/del.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2003-03-13 18:40:20

by Alex Tomas

[permalink] [raw]
Subject: Re: [Ext2-devel] [PATCH] concurrent block allocation for ext2 against 2.5.64


fs/attr.c:
if (ia_valid & ATTR_SIZE) {
if (attr->ia_size == inode->i_size) {
if (ia_valid == ATTR_SIZE)
goto out; /* we can skip lock_kernel() */
} else {
lock_kernel();
error = vmtruncate(inode, attr->ia_size);
unlock_kernel();
if (error)
goto out;
}
}

so, all (!) truncates are serialized

>>>>> Andreas Dilger (AD) writes:

AD> On Mar 13, 2003 11:55 +0300, Alex Tomas wrote:
>> as Andrew said, concurrent balloc for ext3 is useless because of
>> BKL. and I saw it in benchmarks. but it may be useful for ext2.

AD> Sadly, we are constantly diverging the ext2/ext3 codebases. Lots
AD> of features are going into ext3, but lots of fixes/improvements
AD> are only going into ext2. Is ext3 holding BKL for doing
AD> journal_start() still?

AD> Looking at ext3_prepare_write() we grab the BKL for doing
AD> journal_start() and for journal_stop(), but I don't _think_ we
AD> need BKL for journal_stop() do we? We may or may not need it for
AD> the journal_data case, but that is not even working right now I
AD> think.

AD> It also seems we are getting BKL in ext3_truncate(), which likely
AD> isn't needed past journal_start(), although we do need to have
AD> superblock-only lock for ext3_orphan_add/del.



2003-03-13 18:58:34

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [Ext2-devel] [PATCH] concurrent block allocation for ext2 against 2.5.64

On Thu, Mar 13, 2003 at 09:43:05PM +0300, Alex Tomas wrote:
>
> fs/attr.c:
> if (ia_valid & ATTR_SIZE) {
> if (attr->ia_size == inode->i_size) {
> if (ia_valid == ATTR_SIZE)
> goto out; /* we can skip lock_kernel() */
> } else {
> lock_kernel();
> error = vmtruncate(inode, attr->ia_size);
> unlock_kernel();
> if (error)
> goto out;
> }
> }
>
> so, all (!) truncates are serialized

This looks like a bug. It should be safe to delete them. Rationale:

- Documentation/filesystems/Locking says ->truncate is called without the BKL.
- This isn't the only place vmtruncate() is called. Several of the callers
do it without the BKL (eg xfs, cifs).
- vmtruncate() appears to handle its own locking (mapping->i_shared_sem)

Comments?

--
"It's not Hollywood. War is real, war is primarily not about defeat or
victory, it is about death. I've seen thousands and thousands of dead bodies.
Do you think I want to have an academic debate on this subject?" -- Robert Fisk

2003-03-13 19:13:34

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [Ext2-devel] [PATCH] concurrent block allocation for ext2 against 2.5.64

On Thu, Mar 13, 2003 at 10:39:48AM -0700, Andreas Dilger wrote:
> Sadly, we are constantly diverging the ext2/ext3 codebases. Lots of
> features are going into ext3, but lots of fixes/improvements are only
> going into ext2. Is ext3 holding BKL for doing journal_start() still?
>
> Looking at ext3_prepare_write() we grab the BKL for doing journal_start()
> and for journal_stop(), but I don't _think_ we need BKL for journal_stop()
> do we? We may or may not need it for the journal_data case, but that is
> not even working right now I think.

We badly need to remove the BKL from ext3; it's the source of massive
performance problems for ext3 on larger machines.

Stephen, you were telling me a week or two ago that there were some
subtle issues involved with BKL removal from the jbd layer --- could
you give us a quick summary of what landmines are there for whoever
wants to try to tackle the ext3/jbd BKL removal?

- Ted

2003-03-13 19:15:32

by Alex Tomas

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64


hi!

here is the new version of the patch. changes since the last one:
1) new primitives ext2_set_bit_atomic and ext2_clear_bit_atomic have been introduced.
primitives have additional parameter spinlock *, defined for every arch. each arch
should use atomic test_and_set_bit/test_and_clear_bit or use ext2_set_bit and
ext2_clear_bit serialized by this lock
2) each group has own spinlock, which is used for group counter modifications and may
be used to implement ext2_set_bit_atomic/ext2_clear_bit_atomic
3) sb->s_free_blocks_count isn't used any more. ext2_statfs() and find_group_orlov()
loop over groups to count free blocks
4) sb->s_free_blocks_count is recalculated at mount/umount/sync_super time in order
to check consistency and to avoid fsck warnings
5) reserved blocks are distributed over all groups at mount time
6) ext2_new_block() tries to use non-reserved blocks and if it fails then tries to
use reserved blocks
7) ext2_new_block() and ext2_free_blocks do not modify sb->s_free_blocks, therefore
they do not call mark_buffer_dirty() for superblock's buffer_head. I think it
may reduce I/O a bit

Thanks to Andrew for the idea.


diff -uNr linux/fs/ext2/balloc.c edited/fs/ext2/balloc.c
--- linux/fs/ext2/balloc.c Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/balloc.c Thu Mar 13 21:20:16 2003
@@ -94,69 +94,62 @@
return bh;
}

-static inline int reserve_blocks(struct super_block *sb, int count)
+static inline int group_reserve_blocks(struct ext2_sb_info *sbi, struct ext2_bg_info *bgi,
+ struct ext2_group_desc *desc,
+ struct buffer_head *bh, int count, int use_reserve)
{
- struct ext2_sb_info * sbi = EXT2_SB(sb);
- struct ext2_super_block * es = sbi->s_es;
- unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
- unsigned root_blocks = le32_to_cpu(es->s_r_blocks_count);
+ unsigned free_blocks;
+ unsigned root_blocks;

+ spin_lock(&bgi->alloc_lock);
+
+ free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
if (free_blocks < count)
count = free_blocks;
+ root_blocks = bgi->reserved;

- if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
- sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
- /*
- * We are too close to reserve and we are not privileged.
- * Can we allocate anything at all?
- */
- if (free_blocks > root_blocks)
- count = free_blocks - root_blocks;
- else
- return 0;
+ if (free_blocks < root_blocks && !use_reserve) {
+ /* don't use reserved blocks */
+ spin_unlock(&bgi->alloc_lock);
+ return 0;
}
-
- es->s_free_blocks_count = cpu_to_le32(free_blocks - count);
- mark_buffer_dirty(sbi->s_sbh);
- sb->s_dirt = 1;
- return count;
-}
-
-static inline void release_blocks(struct super_block *sb, int count)
-{
- if (count) {
- struct ext2_sb_info * sbi = EXT2_SB(sb);
- struct ext2_super_block * es = sbi->s_es;
- unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
- es->s_free_blocks_count = cpu_to_le32(free_blocks + count);
- mark_buffer_dirty(sbi->s_sbh);
- sb->s_dirt = 1;
+
+ if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
+ sbi->s_resuid != current->fsuid &&
+ (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+ /*
+ * We are too close to reserve and we are not privileged.
+ * Can we allocate anything at all?
+ */
+ if (free_blocks > root_blocks)
+ count = free_blocks - root_blocks;
+ else {
+ spin_unlock(&bgi->alloc_lock);
+ return 0;
+ }
}
-}
-
-static inline int group_reserve_blocks(struct ext2_group_desc *desc,
- struct buffer_head *bh, int count)
-{
- unsigned free_blocks;
-
- if (!desc->bg_free_blocks_count)
- return 0;
-
- free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
- if (free_blocks < count)
- count = free_blocks;
desc->bg_free_blocks_count = cpu_to_le16(free_blocks - count);
+
+ spin_unlock(&bgi->alloc_lock);
+
mark_buffer_dirty(bh);
return count;
}

-static inline void group_release_blocks(struct ext2_group_desc *desc,
- struct buffer_head *bh, int count)
+static inline void group_release_blocks(struct ext2_bg_info *bgi,
+ struct ext2_group_desc *desc,
+ struct buffer_head *bh, int count)
{
if (count) {
- unsigned free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
+ unsigned free_blocks;
+
+ spin_lock(&bgi->alloc_lock);
+
+ free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count);
+
+ spin_unlock(&bgi->alloc_lock);
+
mark_buffer_dirty(bh);
}
}
@@ -172,12 +165,11 @@
unsigned long i;
unsigned long overflow;
struct super_block * sb = inode->i_sb;
+ struct ext2_sb_info * sbi = EXT2_SB(sb);
struct ext2_group_desc * desc;
- struct ext2_super_block * es;
+ struct ext2_super_block * es = sbi->s_es;
unsigned freed = 0, group_freed;

- lock_super (sb);
- es = EXT2_SB(sb)->s_es;
if (block < le32_to_cpu(es->s_first_data_block) ||
block + count < block ||
block + count > le32_to_cpu(es->s_blocks_count)) {
@@ -215,16 +207,17 @@
if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
in_range (block, le32_to_cpu(desc->bg_inode_table),
- EXT2_SB(sb)->s_itb_per_group) ||
+ sbi->s_itb_per_group) ||
in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
- EXT2_SB(sb)->s_itb_per_group))
+ sbi->s_itb_per_group))
ext2_error (sb, "ext2_free_blocks",
"Freeing blocks in system zones - "
"Block = %lu, count = %lu",
block, count);

for (i = 0, group_freed = 0; i < count; i++) {
- if (!ext2_clear_bit(bit + i, bitmap_bh->b_data))
+ if (!ext2_clear_bit_atomic(&sbi->s_bgi[block_group].alloc_lock,
+ bit + i, (void *) bitmap_bh->b_data))
ext2_error (sb, "ext2_free_blocks",
"bit already cleared for block %lu",
block + i);
@@ -236,7 +229,7 @@
if (sb->s_flags & MS_SYNCHRONOUS)
sync_dirty_buffer(bitmap_bh);

- group_release_blocks(desc, bh2, group_freed);
+ group_release_blocks(&sbi->s_bgi[block_group], desc, bh2, group_freed);
freed += group_freed;

if (overflow) {
@@ -246,18 +239,18 @@
}
error_return:
brelse(bitmap_bh);
- release_blocks(sb, freed);
- unlock_super (sb);
DQUOT_FREE_BLOCK(inode, freed);
}

-static int grab_block(char *map, unsigned size, int goal)
+static int grab_block(spinlock_t *lock, char *map, unsigned size, int goal)
{
int k;
char *p, *r;

if (!ext2_test_bit(goal, map))
goto got_it;
+
+repeat:
if (goal) {
/*
* The goal was occupied; search forward for a free
@@ -297,7 +290,8 @@
}
return -1;
got_it:
- ext2_set_bit(goal, map);
+ if (ext2_set_bit_atomic(lock, goal, (void *) map))
+ goto repeat;
return goal;
}

@@ -319,7 +313,7 @@
int ret_block; /* j */
int bit; /* k */
int target_block; /* tmp */
- int block = 0;
+ int block = 0, use_reserve = 0;
struct super_block *sb = inode->i_sb;
struct ext2_sb_info *sbi = EXT2_SB(sb);
struct ext2_super_block *es = sbi->s_es;
@@ -341,14 +335,7 @@
prealloc_goal--;

dq_alloc = prealloc_goal + 1;
-
- lock_super (sb);
-
- es_alloc = reserve_blocks(sb, dq_alloc);
- if (!es_alloc) {
- *err = -ENOSPC;
- goto out_unlock;
- }
+ es_alloc = dq_alloc;

ext2_debug ("goal=%lu.\n", goal);

@@ -360,7 +347,8 @@
if (!desc)
goto io_error;

- group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+ group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no],
+ desc, gdp_bh, es_alloc, 0);
if (group_alloc) {
ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
group_size);
@@ -371,11 +359,12 @@

ext2_debug("goal is at %d:%d.\n", group_no, ret_block);

- ret_block = grab_block(bitmap_bh->b_data,
+ ret_block = grab_block(&sbi->s_bgi[group_no].alloc_lock,
+ bitmap_bh->b_data,
group_size, ret_block);
if (ret_block >= 0)
goto got_block;
- group_release_blocks(desc, gdp_bh, group_alloc);
+ group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc);
group_alloc = 0;
}

@@ -385,6 +374,7 @@
* Now search the rest of the groups. We assume that
* i and desc correctly point to the last group visited.
*/
+repeat:
for (bit = 0; !group_alloc &&
bit < sbi->s_groups_count; bit++) {
group_no++;
@@ -393,7 +383,16 @@
desc = ext2_get_group_desc(sb, group_no, &gdp_bh);
if (!desc)
goto io_error;
- group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+ group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no],
+ desc, gdp_bh, es_alloc, use_reserve);
+ }
+ if (!use_reserve) {
+ /* first time we did not try to allocate
+ * reserved blocks. now it looks like
+ * no more non-reserved blocks left. we
+ * will try to allocate reserved blocks -bzzz */
+ use_reserve = 1;
+ goto repeat;
}
if (bit >= sbi->s_groups_count) {
*err = -ENOSPC;
@@ -404,13 +403,11 @@
if (!bitmap_bh)
goto io_error;

- ret_block = grab_block(bitmap_bh->b_data, group_size, 0);
+ ret_block = grab_block(&sbi->s_bgi[group_no].alloc_lock,
+ bitmap_bh->b_data, group_size, 0);
if (ret_block < 0) {
- ext2_error (sb, "ext2_new_block",
- "Free blocks count corrupted for block group %d",
- group_no);
group_alloc = 0;
- goto io_error;
+ goto repeat;
}

got_block:
@@ -452,7 +449,8 @@
unsigned n;

for (n = 0; n < group_alloc && ++ret_block < group_size; n++) {
- if (ext2_set_bit(ret_block, bitmap_bh->b_data))
+ if (ext2_set_bit_atomic(&sbi->s_bgi[group_no].alloc_lock,
+ ret_block, (void*) bitmap_bh->b_data))
break;
}
*prealloc_block = block + 1;
@@ -471,10 +469,7 @@

*err = 0;
out_release:
- group_release_blocks(desc, gdp_bh, group_alloc);
- release_blocks(sb, es_alloc);
-out_unlock:
- unlock_super (sb);
+ group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc);
DQUOT_FREE_BLOCK(inode, dq_alloc);
out:
brelse(bitmap_bh);
@@ -487,11 +482,11 @@

unsigned long ext2_count_free_blocks (struct super_block * sb)
{
-#ifdef EXT2FS_DEBUG
- struct ext2_super_block * es;
- unsigned long desc_count, bitmap_count, x;
struct ext2_group_desc * desc;
+ unsigned long desc_count = 0;
int i;
+#ifdef EXT2FS_DEBUG
+ unsigned long bitmap_count, x;

lock_super (sb);
es = EXT2_SB(sb)->s_es;
@@ -519,7 +514,13 @@
unlock_super (sb);
return bitmap_count;
#else
- return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_blocks_count);
+ for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+ desc = ext2_get_group_desc (sb, i, NULL);
+ if (!desc)
+ continue;
+ desc_count += le16_to_cpu(desc->bg_free_blocks_count);
+ }
+ return desc_count;
#endif
}

diff -uNr linux/fs/ext2/ialloc.c edited/fs/ext2/ialloc.c
--- linux/fs/ext2/ialloc.c Mon Mar 10 14:52:34 2003
+++ edited/fs/ext2/ialloc.c Thu Mar 13 20:08:58 2003
@@ -278,7 +278,8 @@
int ngroups = sbi->s_groups_count;
int inodes_per_group = EXT2_INODES_PER_GROUP(sb);
int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups;
- int avefreeb = le32_to_cpu(es->s_free_blocks_count) / ngroups;
+ int free_blocks = ext2_count_free_blocks(sb);
+ int avefreeb = free_blocks / ngroups;
int blocks_per_dir;
int ndirs = sbi->s_dir_count;
int max_debt, max_dirs, min_blocks, min_inodes;
@@ -320,8 +321,7 @@
goto fallback;
}

- blocks_per_dir = (le32_to_cpu(es->s_blocks_count) -
- le32_to_cpu(es->s_free_blocks_count)) / ndirs;
+ blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - free_blocks) / ndirs;

max_dirs = ndirs / ngroups + inodes_per_group / 16;
min_inodes = avefreei - inodes_per_group / 4;
@@ -340,7 +340,7 @@
desc = ext2_get_group_desc (sb, group, &bh);
if (!desc || !desc->bg_free_inodes_count)
continue;
- if (sbi->s_debts[group] >= max_debt)
+ if (sbi->s_bgi[group].debts >= max_debt)
continue;
if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
continue;
@@ -501,11 +501,11 @@
cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);

if (S_ISDIR(mode)) {
- if (EXT2_SB(sb)->s_debts[group] < 255)
- EXT2_SB(sb)->s_debts[group]++;
+ if (EXT2_SB(sb)->s_bgi[group].debts < 255)
+ EXT2_SB(sb)->s_bgi[group].debts++;
} else {
- if (EXT2_SB(sb)->s_debts[group])
- EXT2_SB(sb)->s_debts[group]--;
+ if (EXT2_SB(sb)->s_bgi[group].debts)
+ EXT2_SB(sb)->s_bgi[group].debts--;
}

mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
diff -uNr linux/fs/ext2/super.c edited/fs/ext2/super.c
--- linux/fs/ext2/super.c Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/super.c Thu Mar 13 17:34:35 2003
@@ -141,7 +141,7 @@
if (sbi->s_group_desc[i])
brelse (sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
- kfree(sbi->s_debts);
+ kfree(sbi->s_bgi);
brelse (sbi->s_sbh);
sb->s_fs_info = NULL;
kfree(sbi);
@@ -464,8 +464,11 @@
int i;
int desc_block = 0;
struct ext2_sb_info *sbi = EXT2_SB(sb);
- unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+ struct ext2_super_block * es = sbi->s_es;
+ unsigned long block = le32_to_cpu(es->s_first_data_block);
struct ext2_group_desc * gdp = NULL;
+ unsigned int total_free = 0;
+ unsigned int reserved = le32_to_cpu(es->s_r_blocks_count);

ext2_debug ("Checking group descriptors");

@@ -504,6 +507,41 @@
block += EXT2_BLOCKS_PER_GROUP(sb);
gdp++;
}
+
+ /* restore free blocks counter in SB -bzzz */
+ total_free = ext2_count_free_blocks(sb);
+ if (le32_to_cpu(es->s_free_blocks_count) != total_free)
+ printk(KERN_INFO "EXT2-fs: last umount wasn't clean. correct free blocks counter\n");
+ es->s_free_blocks_count = cpu_to_le32(total_free);
+
+ /* distribute reserved blocks over groups -bzzz */
+ while (reserved && total_free) {
+ unsigned int per_group = reserved / sbi->s_groups_count + 1;
+ unsigned int free;
+
+ for (i = 0; reserved && i < sbi->s_groups_count; i++) {
+ gdp = ext2_get_group_desc (sb, i, NULL);
+ if (!gdp) {
+ ext2_error (sb, "ext2_check_descriptors",
+ "can't get descriptor for group #%d", i);
+ return 0;
+ }
+
+ free = le16_to_cpu(gdp->bg_free_blocks_count);
+ if (per_group > free)
+ per_group = free;
+ if (per_group > reserved)
+ per_group = reserved;
+ sbi->s_bgi[i].reserved += per_group;
+ reserved -= per_group;
+ total_free -= per_group;
+
+ /* correct per group approximation */
+ if (i < sbi->s_groups_count - i)
+ per_group = reserved / (sbi->s_groups_count - i - 1) + 1;
+ }
+ }
+
return 1;
}

@@ -768,13 +806,17 @@
printk ("EXT2-fs: not enough memory\n");
goto failed_mount;
}
- sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
+ sbi->s_bgi = kmalloc(sbi->s_groups_count*sizeof(struct ext2_bg_info),
GFP_KERNEL);
- if (!sbi->s_debts) {
+ if (!sbi->s_bgi) {
printk ("EXT2-fs: not enough memory\n");
goto failed_mount_group_desc;
}
- memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(*sbi->s_debts));
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ sbi->s_bgi[i].debts = 0;
+ sbi->s_bgi[i].reserved = 0;
+ spin_lock_init(&sbi->s_bgi[i].alloc_lock);
+ }
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logic_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
@@ -820,8 +862,8 @@
brelse(sbi->s_group_desc[i]);
failed_mount_group_desc:
kfree(sbi->s_group_desc);
- if (sbi->s_debts)
- kfree(sbi->s_debts);
+ if (sbi->s_bgi)
+ kfree(sbi->s_bgi);
failed_mount:
brelse(bh);
failed_sbi:
@@ -840,6 +882,7 @@

static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
{
+ es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
es->s_wtime = cpu_to_le32(get_seconds());
mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
@@ -868,6 +911,7 @@
ext2_debug ("setting valid to 0\n");
es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) &
~EXT2_VALID_FS);
+ es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
es->s_mtime = cpu_to_le32(get_seconds());
ext2_sync_super(sb, es);
} else
@@ -929,7 +973,8 @@
static int ext2_statfs (struct super_block * sb, struct statfs * buf)
{
struct ext2_sb_info *sbi = EXT2_SB(sb);
- unsigned long overhead;
+ unsigned long overhead, total_free = 0;
+ struct ext2_group_desc *desc;
int i;

if (test_opt (sb, MINIX_DF))
@@ -950,9 +995,14 @@
* block group descriptors. If the sparse superblocks
* feature is turned on, then not all groups have this.
*/
- for (i = 0; i < sbi->s_groups_count; i++)
+ for (i = 0; i < sbi->s_groups_count; i++) {
overhead += ext2_bg_has_super(sb, i) +
ext2_bg_num_gdb(sb, i);
+
+ /* sum total free blocks -bzzz */
+ desc = ext2_get_group_desc (sb, i, NULL);
+ total_free += le16_to_cpu(desc->bg_free_blocks_count);
+ }

/*
* Every block group has an inode bitmap, a block
@@ -965,7 +1015,7 @@
buf->f_type = EXT2_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = le32_to_cpu(sbi->s_es->s_blocks_count) - overhead;
- buf->f_bfree = ext2_count_free_blocks (sb);
+ buf->f_bfree = total_free;
buf->f_bavail = buf->f_bfree - le32_to_cpu(sbi->s_es->s_r_blocks_count);
if (buf->f_bfree < le32_to_cpu(sbi->s_es->s_r_blocks_count))
buf->f_bavail = 0;
diff -uNr linux/include/asm-alpha/bitops.h edited/include/asm-alpha/bitops.h
--- linux/include/asm-alpha/bitops.h Mon Mar 10 14:52:36 2003
+++ edited/include/asm-alpha/bitops.h Thu Mar 13 14:10:18 2003
@@ -487,7 +487,9 @@


#define ext2_set_bit __test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
#define ext2_clear_bit __test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
#define ext2_test_bit test_bit
#define ext2_find_first_zero_bit find_first_zero_bit
#define ext2_find_next_zero_bit find_next_zero_bit
diff -uNr linux/include/asm-arm/bitops.h edited/include/asm-arm/bitops.h
--- linux/include/asm-arm/bitops.h Mon Mar 10 14:52:36 2003
+++ edited/include/asm-arm/bitops.h Thu Mar 13 14:10:46 2003
@@ -357,8 +357,12 @@
*/
#define ext2_set_bit(nr,p) \
__test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
+#define ext2_set_bit_atomic(lock,nr,p) \
+ test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
#define ext2_clear_bit(nr,p) \
__test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
+#define ext2_clear_bit_atomic(lock,nr,p) \
+ test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
#define ext2_test_bit(nr,p) \
__test_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
#define ext2_find_first_zero_bit(p,sz) \
diff -uNr linux/include/asm-cris/bitops.h edited/include/asm-cris/bitops.h
--- linux/include/asm-cris/bitops.h Mon Nov 11 06:28:30 2002
+++ edited/include/asm-cris/bitops.h Thu Mar 13 14:11:15 2003
@@ -360,7 +360,9 @@
#define hweight8(x) generic_hweight8(x)

#define ext2_set_bit test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
#define ext2_clear_bit test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
#define ext2_test_bit test_bit
#define ext2_find_first_zero_bit find_first_zero_bit
#define ext2_find_next_zero_bit find_next_zero_bit
diff -uNr linux/include/asm-i386/bitops.h edited/include/asm-i386/bitops.h
--- linux/include/asm-i386/bitops.h Wed Dec 25 06:03:08 2002
+++ edited/include/asm-i386/bitops.h Thu Mar 13 14:11:32 2003
@@ -479,8 +479,12 @@

#define ext2_set_bit(nr,addr) \
__test_and_set_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock,nr,addr) \
+ test_and_set_bit((nr),(unsigned long*)addr)
#define ext2_clear_bit(nr, addr) \
__test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock,nr, addr) \
+ test_and_clear_bit((nr),(unsigned long*)addr)
#define ext2_test_bit(nr, addr) test_bit((nr),(unsigned long*)addr)
#define ext2_find_first_zero_bit(addr, size) \
find_first_zero_bit((unsigned long*)addr, size)
diff -uNr linux/include/asm-ia64/bitops.h edited/include/asm-ia64/bitops.h
--- linux/include/asm-ia64/bitops.h Thu Feb 20 16:18:21 2003
+++ edited/include/asm-ia64/bitops.h Thu Mar 13 14:12:50 2003
@@ -453,7 +453,9 @@
#define __clear_bit(nr, addr) clear_bit(nr, addr)

#define ext2_set_bit test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
#define ext2_clear_bit test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
#define ext2_test_bit test_bit
#define ext2_find_first_zero_bit find_first_zero_bit
#define ext2_find_next_zero_bit find_next_zero_bit
diff -uNr linux/include/asm-m68k/bitops.h edited/include/asm-m68k/bitops.h
--- linux/include/asm-m68k/bitops.h Mon Nov 11 06:28:33 2002
+++ edited/include/asm-m68k/bitops.h Thu Mar 13 14:15:31 2003
@@ -355,6 +355,16 @@
}

extern __inline__ int
+ext2_set_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, vaddr);
+ spin_unlock(lock);
+ return ret;
+}
+
+extern __inline__ int
ext2_clear_bit (int nr, volatile void *vaddr)
{
char retval;
@@ -366,6 +376,16 @@
}

extern __inline__ int
+ext2_clear_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, vaddr);
+ spin_unlock(lock);
+ return ret;
+}
+
+extern __inline__ int
ext2_test_bit (int nr, const volatile void *vaddr)
{
return ((1U << (nr & 7)) & (((const volatile unsigned char *) vaddr)[nr >> 3])) != 0;
diff -uNr linux/include/asm-m68knommu/bitops.h edited/include/asm-m68knommu/bitops.h
--- linux/include/asm-m68knommu/bitops.h Mon Nov 11 06:28:04 2002
+++ edited/include/asm-m68knommu/bitops.h Thu Mar 13 14:18:21 2003
@@ -387,6 +387,16 @@
return retval;
}

+extern __inline__ int ext2_set_bit_atomic(spinlock_t *lock, int nr,
+ volatile void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int ext2_clear_bit(int nr, volatile void * addr)
{
int mask, retval;
@@ -402,6 +412,16 @@
return retval;
}

+extern __inline__ int ext2_clear_bit_atomic(spinlock_t *lock, int nr,
+ volatile void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int ext2_test_bit(int nr, const volatile void * addr)
{
int mask;
diff -uNr linux/include/asm-mips/bitops.h edited/include/asm-mips/bitops.h
--- linux/include/asm-mips/bitops.h Mon Nov 11 06:28:03 2002
+++ edited/include/asm-mips/bitops.h Thu Mar 13 14:24:52 2003
@@ -810,6 +810,15 @@
return retval;
}

+extern __inline__ int ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int ext2_clear_bit(int nr, void * addr)
{
int mask, retval, flags;
@@ -824,6 +833,15 @@
return retval;
}

+extern __inline__ int ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int ext2_test_bit(int nr, const void * addr)
{
int mask;
@@ -890,7 +908,9 @@

/* Native ext2 byte ordering, just collapse using defines. */
#define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr))
+#define ext2_set_bit_atomic(lock,nr,addr) test_and_set_bit((nr), (addr))
#define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr))
+#define ext2_clear_bit_atomic(lock,nr,addr) test_and_clear_bit((nr), (addr))
#define ext2_test_bit(nr, addr) test_bit((nr), (addr))
#define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size))
#define ext2_find_next_zero_bit(addr, size, offset) \
diff -uNr linux/include/asm-mips64/bitops.h edited/include/asm-mips64/bitops.h
--- linux/include/asm-mips64/bitops.h Mon Nov 11 06:28:29 2002
+++ edited/include/asm-mips64/bitops.h Thu Mar 13 14:27:26 2003
@@ -517,6 +517,16 @@
}

extern inline int
+ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
+extern inline int
ext2_clear_bit(int nr, void * addr)
{
int mask, retval, flags;
@@ -532,6 +542,16 @@
}

extern inline int
+ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
+extern inline int
ext2_test_bit(int nr, const void * addr)
{
int mask;
@@ -599,7 +619,9 @@

/* Native ext2 byte ordering, just collapse using defines. */
#define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr), (addr))
#define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr), (addr))
#define ext2_test_bit(nr, addr) test_bit((nr), (addr))
#define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size))
#define ext2_find_next_zero_bit(addr, size, offset) \
diff -uNr linux/include/asm-parisc/bitops.h edited/include/asm-parisc/bitops.h
--- linux/include/asm-parisc/bitops.h Thu Feb 20 16:18:21 2003
+++ edited/include/asm-parisc/bitops.h Thu Mar 13 14:29:47 2003
@@ -389,10 +389,14 @@
*/
#ifdef __LP64__
#define ext2_set_bit(nr, addr) test_and_set_bit((nr) ^ 0x38, addr)
+#define ext2_set_bit_atomic(l,nr,addr) test_and_set_bit((nr) ^ 0x38, addr)
#define ext2_clear_bit(nr, addr) test_and_clear_bit((nr) ^ 0x38, addr)
+#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x38, addr)
#else
#define ext2_set_bit(nr, addr) test_and_set_bit((nr) ^ 0x18, addr)
+#define ext2_set_bit_atomic(l,nr,addr) test_and_set_bit((nr) ^ 0x18, addr)
#define ext2_clear_bit(nr, addr) test_and_clear_bit((nr) ^ 0x18, addr)
+#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x18, addr)
#endif

#endif /* __KERNEL__ */
diff -uNr linux/include/asm-ppc/bitops.h edited/include/asm-ppc/bitops.h
--- linux/include/asm-ppc/bitops.h Mon Jan 20 05:23:05 2003
+++ edited/include/asm-ppc/bitops.h Thu Mar 13 14:31:00 2003
@@ -392,7 +392,9 @@


#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))

static __inline__ int ext2_test_bit(int nr, __const__ void * addr)
{
diff -uNr linux/include/asm-ppc64/bitops.h edited/include/asm-ppc64/bitops.h
--- linux/include/asm-ppc64/bitops.h Mon Nov 11 06:28:28 2002
+++ edited/include/asm-ppc64/bitops.h Thu Mar 13 14:32:23 2003
@@ -336,8 +336,12 @@

#define ext2_set_bit(nr,addr) \
__test_and_set_le_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock, nr,addr) \
+ test_and_set_le_bit((nr),(unsigned long*)addr)
#define ext2_clear_bit(nr, addr) \
__test_and_clear_le_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+ test_and_clear_le_bit((nr),(unsigned long*)addr)
#define ext2_test_bit(nr, addr) test_le_bit((nr),(unsigned long*)addr)
#define ext2_find_first_zero_bit(addr, size) \
find_first_zero_le_bit((unsigned long*)addr, size)
diff -uNr linux/include/asm-s390/bitops.h edited/include/asm-s390/bitops.h
--- linux/include/asm-s390/bitops.h Mon Mar 10 14:52:09 2003
+++ edited/include/asm-s390/bitops.h Thu Mar 13 14:33:55 2003
@@ -805,8 +805,12 @@

#define ext2_set_bit(nr, addr) \
test_and_set_bit((nr)^24, (unsigned long *)addr)
+#define ext2_set_bit_atomic(lock, nr, addr) \
+ test_and_set_bit((nr)^24, (unsigned long *)addr)
#define ext2_clear_bit(nr, addr) \
test_and_clear_bit((nr)^24, (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+ test_and_clear_bit((nr)^24, (unsigned long *)addr)
#define ext2_test_bit(nr, addr) \
test_bit((nr)^24, (unsigned long *)addr)

diff -uNr linux/include/asm-s390x/bitops.h edited/include/asm-s390x/bitops.h
--- linux/include/asm-s390x/bitops.h Mon Mar 10 14:52:09 2003
+++ edited/include/asm-s390x/bitops.h Thu Mar 13 14:35:22 2003
@@ -838,8 +838,12 @@

#define ext2_set_bit(nr, addr) \
test_and_set_bit((nr)^56, (unsigned long *)addr)
+#define ext2_set_bit_atomic(lock, nr, addr) \
+ test_and_set_bit((nr)^56, (unsigned long *)addr)
#define ext2_clear_bit(nr, addr) \
test_and_clear_bit((nr)^56, (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+ test_and_clear_bit((nr)^56, (unsigned long *)addr)
#define ext2_test_bit(nr, addr) \
test_bit((nr)^56, (unsigned long *)addr)

diff -uNr linux/include/asm-sh/bitops.h edited/include/asm-sh/bitops.h
--- linux/include/asm-sh/bitops.h Mon Nov 11 06:28:02 2002
+++ edited/include/asm-sh/bitops.h Thu Mar 13 14:37:18 2003
@@ -265,6 +265,16 @@
return retval;
}

+static __inline__ int ext2_set_bit_atomic(spinlock_t *lock,
+ int nr, volatile void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
static __inline__ int ext2_clear_bit(int nr, volatile void * addr)
{
int mask, retval;
@@ -280,6 +290,16 @@
return retval;
}

+static __inline__ int ext2_clear_bit_atomic(spinlock_t *lock,
+ int nr, volatile void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
static __inline__ int ext2_test_bit(int nr, const volatile void * addr)
{
int mask;
diff -uNr linux/include/asm-sparc/bitops.h edited/include/asm-sparc/bitops.h
--- linux/include/asm-sparc/bitops.h Mon Jan 20 05:23:05 2003
+++ edited/include/asm-sparc/bitops.h Thu Mar 13 14:38:54 2003
@@ -454,7 +454,9 @@
find_next_zero_le_bit((addr), (size), 0)

#define ext2_set_bit __test_and_set_le_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_le_bit(n,a)
#define ext2_clear_bit __test_and_clear_le_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_le_bit(n,a)
#define ext2_test_bit test_le_bit
#define ext2_find_first_zero_bit find_first_zero_le_bit
#define ext2_find_next_zero_bit find_next_zero_le_bit
diff -uNr linux/include/asm-sparc64/bitops.h edited/include/asm-sparc64/bitops.h
--- linux/include/asm-sparc64/bitops.h Mon Nov 11 06:28:05 2002
+++ edited/include/asm-sparc64/bitops.h Thu Mar 13 14:43:49 2003
@@ -351,7 +351,9 @@
#ifdef __KERNEL__

#define ext2_set_bit(nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr))
+#define ext2_set_bit_atomic(lock,nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr))
#define ext2_clear_bit(nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock,nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr))
#define ext2_test_bit(nr,addr) test_le_bit((nr),(unsigned long *)(addr))
#define ext2_find_first_zero_bit(addr, size) \
find_first_zero_le_bit((unsigned long *)(addr), (size))
diff -uNr linux/include/asm-v850/bitops.h edited/include/asm-v850/bitops.h
--- linux/include/asm-v850/bitops.h Mon Nov 11 06:28:02 2002
+++ edited/include/asm-v850/bitops.h Thu Mar 13 14:44:48 2003
@@ -252,7 +252,9 @@
#define hweight8(x) generic_hweight8 (x)

#define ext2_set_bit test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
#define ext2_clear_bit test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
#define ext2_test_bit test_bit
#define ext2_find_first_zero_bit find_first_zero_bit
#define ext2_find_next_zero_bit find_next_zero_bit
diff -uNr linux/include/asm-x86_64/bitops.h edited/include/asm-x86_64/bitops.h
--- linux/include/asm-x86_64/bitops.h Mon Mar 10 14:52:09 2003
+++ edited/include/asm-x86_64/bitops.h Thu Mar 13 14:45:56 2003
@@ -487,8 +487,12 @@

#define ext2_set_bit(nr,addr) \
__test_and_set_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock,nr,addr) \
+ test_and_set_bit((nr),(unsigned long*)addr)
#define ext2_clear_bit(nr, addr) \
__test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock,nr,addr) \
+ test_and_clear_bit((nr),(unsigned long*)addr)
#define ext2_test_bit(nr, addr) test_bit((nr),(unsigned long*)addr)
#define ext2_find_first_zero_bit(addr, size) \
find_first_zero_bit((unsigned long*)addr, size)
diff -uNr linux/include/linux/ext2_fs_sb.h edited/include/linux/ext2_fs_sb.h
--- linux/include/linux/ext2_fs_sb.h Mon Nov 11 06:28:30 2002
+++ edited/include/linux/ext2_fs_sb.h Thu Mar 13 15:56:52 2003
@@ -16,6 +16,12 @@
#ifndef _LINUX_EXT2_FS_SB
#define _LINUX_EXT2_FS_SB

+struct ext2_bg_info {
+ u8 debts;
+ spinlock_t alloc_lock;
+ unsigned int reserved;
+};
+
/*
* second extended-fs super-block data in memory
*/
@@ -44,7 +50,7 @@
int s_first_ino;
u32 s_next_generation;
unsigned long s_dir_count;
- u8 *s_debts;
+ struct ext2_bg_info *s_bgi;
};

#endif /* _LINUX_EXT2_FS_SB */


2003-03-13 19:29:16

by Andrew Morton

[permalink] [raw]
Subject: Re: [Ext2-devel] [PATCH] concurrent block allocation for ext2 against 2.5.64

Matthew Wilcox <[email protected]> wrote:
>
> On Thu, Mar 13, 2003 at 09:43:05PM +0300, Alex Tomas wrote:
> >
> > fs/attr.c:
> > if (ia_valid & ATTR_SIZE) {
> > if (attr->ia_size == inode->i_size) {
> > if (ia_valid == ATTR_SIZE)
> > goto out; /* we can skip lock_kernel() */
> > } else {
> > lock_kernel();
> > error = vmtruncate(inode, attr->ia_size);
> > unlock_kernel();
> > if (error)
> > goto out;
> > }
> > }
> >
> > so, all (!) truncates are serialized
>
> This looks like a bug. It should be safe to delete them.

Probably. I was running without them for months. But this is the
ftruncate() path and not the unlink() path, so I kinda forgot about it.

Most truncations are unlinks, and they are not under lock_kernel.

2003-03-13 19:34:02

by Andreas Dilger

[permalink] [raw]
Subject: Re: [Ext2-devel] [PATCH] concurrent block allocation for ext2 against 2.5.64

On Mar 13, 2003 14:23 -0500, Theodore Ts'o wrote:
> On Thu, Mar 13, 2003 at 10:39:48AM -0700, Andreas Dilger wrote:
> > Sadly, we are constantly diverging the ext2/ext3 codebases. Lots of
> > features are going into ext3, but lots of fixes/improvements are only
> > going into ext2. Is ext3 holding BKL for doing journal_start() still?
> >
> > Looking at ext3_prepare_write() we grab the BKL for doing journal_start()
> > and for journal_stop(), but I don't _think_ we need BKL for journal_stop()
> > do we? We may or may not need it for the journal_data case, but that is
> > not even working right now I think.
>
> We badly need to remove the BKL from ext3; it's the source of massive
> performance problems for ext3 on larger machines.
>
> Stephen, you were telling me a week or two ago that there were some
> subtle issues involved with BKL removal from the jbd layer --- could
> you give us a quick summary of what landminds are there for whoever
> wants to try to tackle the ext3/jbd BKL removal?

Ted, as a start, we can move the (un)lock_kernel() calls from the ext3
code into the journal_start() and journal_stop(), and then continue to
push it down into the places where we need it and/or replace it with a
better lock. This not only makes the lock migration easier, but also
ensures that we always have the lock when we need it.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2003-03-13 22:19:53

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

Alex Tomas <[email protected]> wrote:
>
>
> hi!
>
> here is the new version of the patch.

This is great work.

a) The algorithm which you are using to distribute the root-reserved
blocks across the blockgroups will end up leaving a small number of unused
blocks in every blockgroup. So large files which span multiple
blockgroups will have little gaps in them.

I think it's probably better to just lump all the root-reserved blocks
into as few blockgroups as possible.

Probably these should be the last blockgroups, because those are
nearest the spindle, and hence the slowest. This is by no means always
the case - some disks are backwards, but it seems that most are not. Plus
nearness to the superblock is good.

b) struct ext2_bg_info needs a ____cacheline_aligned_in_smp stuck on it.

c) It looks like EXT2FS_DEBUG broke. Nobody uses that much, but we should
fix and test it sometime.

Be expecting some benchmark numbers. Maybe those 32-ways will be able to run
as fast as my $300 2-way now ;)


2003-03-13 22:53:09

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

On Mar 13, 2003 14:25 -0800, Andrew Morton wrote:
> This is great work.

Agreed. This is something that has been talked about but not implemented
for a long time now. Thanks for the efforts.

> a) The algorithm which you are using to distribute the root-reserved
> blocks across the blockgroups will end up leaving a small number of unused
> blocks in every blockgroup. So large files which span multiple
> blockgroups will have little gaps in them.
>
> I think it's probably better to just lump all the root-reserved blocks
> into as few blockgroups as possible.

I might disagree here. One of the reasons for having the reserved blocks
is to prevent fragmentation, and not necessarily to reserve space for root.
For the lots of small files cases it makes more sense to leave free space
in each group to prevent fragmentation at the group level.

For the large file case, there is less need to worry about fragmentation,
so we can just ignore the group's reserved percentage for "large" files.
A heuristic which says "if this file is huge, just keep allocating from this
group, and screw the reserved blocks" makes sense.

One such heuristic is if the file is, say, larger than 1/2 or 1/4 of the
entire group in size, it is allowed to continue allocating from the same
group.

We could also say that for the purpose of allocating new files in a directory,
anything more than 95% full is "full" and the inode should be allocated in
a different group regardless of where the parent is. It may be that the
Orlov allocator already has such a heuristic, but I think that is a different
discussion.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2003-03-13 23:01:21

by Alex Tomas

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64


hi!

>>>>> Andrew Morton (AM) writes:

AM> a) The algorithm which you are using to distribute the
AM> root-reserved blocks across the blockgroups will end up leaving a
AM> small number of unused blocks in every blockgroup. So large
AM> files which span multiple blockgroups will have little gaps in
AM> them.

AM> I think it's probably better to just lump all the
AM> root-reserved blocks into as few blockgroups as possible.

AM> Probably these should be the last blockgroups, because those
AM> are nearest the spindle, and hence the slowest. This is by no
AM> means always the case - some disks are backwards, but it seems
AM> that most are not. Plus nearness to the superblock is good.

done

AM> b) struct ext2_bg_info needs a ____cacheline_aligned_in_smp stuck
AM> on it.

done

AM> c) It looks like EXT2FS_DEBUG broke. Nobody uses that much, but
AM> we should fix and test it sometime.

I suggest this be fixed in a separate patch. Do you agree?

AM> Be expecting some benchmark numbers. Maybe those 32-ways will be
AM> able to run as fast as my $300 2-way now ;)

me too ;)


btw, what about minor bug in ext2 allocation code I posted recently?
do you agree it needs to be fixed?




diff -uNr linux/fs/ext2/balloc.c edited/fs/ext2/balloc.c
--- linux/fs/ext2/balloc.c Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/balloc.c Thu Mar 13 21:20:16 2003
@@ -94,69 +94,62 @@
return bh;
}

-static inline int reserve_blocks(struct super_block *sb, int count)
+static inline int group_reserve_blocks(struct ext2_sb_info *sbi, struct ext2_bg_info *bgi,
+ struct ext2_group_desc *desc,
+ struct buffer_head *bh, int count, int use_reserve)
{
- struct ext2_sb_info * sbi = EXT2_SB(sb);
- struct ext2_super_block * es = sbi->s_es;
- unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
- unsigned root_blocks = le32_to_cpu(es->s_r_blocks_count);
+ unsigned free_blocks;
+ unsigned root_blocks;

+ spin_lock(&bgi->alloc_lock);
+
+ free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
if (free_blocks < count)
count = free_blocks;
+ root_blocks = bgi->reserved;

- if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
- sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
- /*
- * We are too close to reserve and we are not privileged.
- * Can we allocate anything at all?
- */
- if (free_blocks > root_blocks)
- count = free_blocks - root_blocks;
- else
- return 0;
+ if (free_blocks < root_blocks && !use_reserve) {
+ /* don't use reserved blocks */
+ spin_unlock(&bgi->alloc_lock);
+ return 0;
}
-
- es->s_free_blocks_count = cpu_to_le32(free_blocks - count);
- mark_buffer_dirty(sbi->s_sbh);
- sb->s_dirt = 1;
- return count;
-}
-
-static inline void release_blocks(struct super_block *sb, int count)
-{
- if (count) {
- struct ext2_sb_info * sbi = EXT2_SB(sb);
- struct ext2_super_block * es = sbi->s_es;
- unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
- es->s_free_blocks_count = cpu_to_le32(free_blocks + count);
- mark_buffer_dirty(sbi->s_sbh);
- sb->s_dirt = 1;
+
+ if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
+ sbi->s_resuid != current->fsuid &&
+ (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+ /*
+ * We are too close to reserve and we are not privileged.
+ * Can we allocate anything at all?
+ */
+ if (free_blocks > root_blocks)
+ count = free_blocks - root_blocks;
+ else {
+ spin_unlock(&bgi->alloc_lock);
+ return 0;
+ }
}
-}
-
-static inline int group_reserve_blocks(struct ext2_group_desc *desc,
- struct buffer_head *bh, int count)
-{
- unsigned free_blocks;
-
- if (!desc->bg_free_blocks_count)
- return 0;
-
- free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
- if (free_blocks < count)
- count = free_blocks;
desc->bg_free_blocks_count = cpu_to_le16(free_blocks - count);
+
+ spin_unlock(&bgi->alloc_lock);
+
mark_buffer_dirty(bh);
return count;
}

-static inline void group_release_blocks(struct ext2_group_desc *desc,
- struct buffer_head *bh, int count)
+static inline void group_release_blocks(struct ext2_bg_info *bgi,
+ struct ext2_group_desc *desc,
+ struct buffer_head *bh, int count)
{
if (count) {
- unsigned free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
+ unsigned free_blocks;
+
+ spin_lock(&bgi->alloc_lock);
+
+ free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count);
+
+ spin_unlock(&bgi->alloc_lock);
+
mark_buffer_dirty(bh);
}
}
@@ -172,12 +165,11 @@
unsigned long i;
unsigned long overflow;
struct super_block * sb = inode->i_sb;
+ struct ext2_sb_info * sbi = EXT2_SB(sb);
struct ext2_group_desc * desc;
- struct ext2_super_block * es;
+ struct ext2_super_block * es = sbi->s_es;
unsigned freed = 0, group_freed;

- lock_super (sb);
- es = EXT2_SB(sb)->s_es;
if (block < le32_to_cpu(es->s_first_data_block) ||
block + count < block ||
block + count > le32_to_cpu(es->s_blocks_count)) {
@@ -215,16 +207,17 @@
if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
in_range (block, le32_to_cpu(desc->bg_inode_table),
- EXT2_SB(sb)->s_itb_per_group) ||
+ sbi->s_itb_per_group) ||
in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
- EXT2_SB(sb)->s_itb_per_group))
+ sbi->s_itb_per_group))
ext2_error (sb, "ext2_free_blocks",
"Freeing blocks in system zones - "
"Block = %lu, count = %lu",
block, count);

for (i = 0, group_freed = 0; i < count; i++) {
- if (!ext2_clear_bit(bit + i, bitmap_bh->b_data))
+ if (!ext2_clear_bit_atomic(&sbi->s_bgi[block_group].alloc_lock,
+ bit + i, (void *) bitmap_bh->b_data))
ext2_error (sb, "ext2_free_blocks",
"bit already cleared for block %lu",
block + i);
@@ -236,7 +229,7 @@
if (sb->s_flags & MS_SYNCHRONOUS)
sync_dirty_buffer(bitmap_bh);

- group_release_blocks(desc, bh2, group_freed);
+ group_release_blocks(&sbi->s_bgi[block_group], desc, bh2, group_freed);
freed += group_freed;

if (overflow) {
@@ -246,18 +239,18 @@
}
error_return:
brelse(bitmap_bh);
- release_blocks(sb, freed);
- unlock_super (sb);
DQUOT_FREE_BLOCK(inode, freed);
}

-static int grab_block(char *map, unsigned size, int goal)
+static int grab_block(spinlock_t *lock, char *map, unsigned size, int goal)
{
int k;
char *p, *r;

if (!ext2_test_bit(goal, map))
goto got_it;
+
+repeat:
if (goal) {
/*
* The goal was occupied; search forward for a free
@@ -297,7 +290,8 @@
}
return -1;
got_it:
- ext2_set_bit(goal, map);
+ if (ext2_set_bit_atomic(lock, goal, (void *) map))
+ goto repeat;
return goal;
}

@@ -319,7 +313,7 @@
int ret_block; /* j */
int bit; /* k */
int target_block; /* tmp */
- int block = 0;
+ int block = 0, use_reserve = 0;
struct super_block *sb = inode->i_sb;
struct ext2_sb_info *sbi = EXT2_SB(sb);
struct ext2_super_block *es = sbi->s_es;
@@ -341,14 +335,7 @@
prealloc_goal--;

dq_alloc = prealloc_goal + 1;
-
- lock_super (sb);
-
- es_alloc = reserve_blocks(sb, dq_alloc);
- if (!es_alloc) {
- *err = -ENOSPC;
- goto out_unlock;
- }
+ es_alloc = dq_alloc;

ext2_debug ("goal=%lu.\n", goal);

@@ -360,7 +347,8 @@
if (!desc)
goto io_error;

- group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+ group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no],
+ desc, gdp_bh, es_alloc, 0);
if (group_alloc) {
ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
group_size);
@@ -371,11 +359,12 @@

ext2_debug("goal is at %d:%d.\n", group_no, ret_block);

- ret_block = grab_block(bitmap_bh->b_data,
+ ret_block = grab_block(&sbi->s_bgi[group_no].alloc_lock,
+ bitmap_bh->b_data,
group_size, ret_block);
if (ret_block >= 0)
goto got_block;
- group_release_blocks(desc, gdp_bh, group_alloc);
+ group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc);
group_alloc = 0;
}

@@ -385,6 +374,7 @@
* Now search the rest of the groups. We assume that
* i and desc correctly point to the last group visited.
*/
+repeat:
for (bit = 0; !group_alloc &&
bit < sbi->s_groups_count; bit++) {
group_no++;
@@ -393,7 +383,16 @@
desc = ext2_get_group_desc(sb, group_no, &gdp_bh);
if (!desc)
goto io_error;
- group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+ group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no],
+ desc, gdp_bh, es_alloc, use_reserve);
+ }
+ if (!use_reserve) {
+ /* first time we did not try to allocate
+ * reserved blocks. now it looks like
+ * no more non-reserved blocks left. we
+ * will try to allocate reserved blocks -bzzz */
+ use_reserve = 1;
+ goto repeat;
}
if (bit >= sbi->s_groups_count) {
*err = -ENOSPC;
@@ -404,13 +403,11 @@
if (!bitmap_bh)
goto io_error;

- ret_block = grab_block(bitmap_bh->b_data, group_size, 0);
+ ret_block = grab_block(&sbi->s_bgi[group_no].alloc_lock,
+ bitmap_bh->b_data, group_size, 0);
if (ret_block < 0) {
- ext2_error (sb, "ext2_new_block",
- "Free blocks count corrupted for block group %d",
- group_no);
group_alloc = 0;
- goto io_error;
+ goto repeat;
}

got_block:
@@ -452,7 +449,8 @@
unsigned n;

for (n = 0; n < group_alloc && ++ret_block < group_size; n++) {
- if (ext2_set_bit(ret_block, bitmap_bh->b_data))
+ if (ext2_set_bit_atomic(&sbi->s_bgi[group_no].alloc_lock,
+ ret_block, (void*) bitmap_bh->b_data))
break;
}
*prealloc_block = block + 1;
@@ -471,10 +469,7 @@

*err = 0;
out_release:
- group_release_blocks(desc, gdp_bh, group_alloc);
- release_blocks(sb, es_alloc);
-out_unlock:
- unlock_super (sb);
+ group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc);
DQUOT_FREE_BLOCK(inode, dq_alloc);
out:
brelse(bitmap_bh);
@@ -487,11 +482,11 @@

unsigned long ext2_count_free_blocks (struct super_block * sb)
{
-#ifdef EXT2FS_DEBUG
- struct ext2_super_block * es;
- unsigned long desc_count, bitmap_count, x;
struct ext2_group_desc * desc;
+ unsigned long desc_count = 0;
int i;
+#ifdef EXT2FS_DEBUG
+ unsigned long bitmap_count, x;

lock_super (sb);
es = EXT2_SB(sb)->s_es;
@@ -519,7 +514,13 @@
unlock_super (sb);
return bitmap_count;
#else
- return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_blocks_count);
+ for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+ desc = ext2_get_group_desc (sb, i, NULL);
+ if (!desc)
+ continue;
+ desc_count += le16_to_cpu(desc->bg_free_blocks_count);
+ }
+ return desc_count;
#endif
}

diff -uNr linux/fs/ext2/ialloc.c edited/fs/ext2/ialloc.c
--- linux/fs/ext2/ialloc.c Fri Mar 14 01:53:36 2003
+++ edited/fs/ext2/ialloc.c Thu Mar 13 20:08:58 2003
@@ -278,7 +278,8 @@
int ngroups = sbi->s_groups_count;
int inodes_per_group = EXT2_INODES_PER_GROUP(sb);
int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups;
- int avefreeb = le32_to_cpu(es->s_free_blocks_count) / ngroups;
+ int free_blocks = ext2_count_free_blocks(sb);
+ int avefreeb = free_blocks / ngroups;
int blocks_per_dir;
int ndirs = sbi->s_dir_count;
int max_debt, max_dirs, min_blocks, min_inodes;
@@ -320,8 +321,7 @@
goto fallback;
}

- blocks_per_dir = (le32_to_cpu(es->s_blocks_count) -
- le32_to_cpu(es->s_free_blocks_count)) / ndirs;
+ blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - free_blocks) / ndirs;

max_dirs = ndirs / ngroups + inodes_per_group / 16;
min_inodes = avefreei - inodes_per_group / 4;
@@ -340,7 +340,7 @@
desc = ext2_get_group_desc (sb, group, &bh);
if (!desc || !desc->bg_free_inodes_count)
continue;
- if (sbi->s_debts[group] >= max_debt)
+ if (sbi->s_bgi[group].debts >= max_debt)
continue;
if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
continue;
@@ -501,11 +501,11 @@
cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);

if (S_ISDIR(mode)) {
- if (EXT2_SB(sb)->s_debts[group] < 255)
- EXT2_SB(sb)->s_debts[group]++;
+ if (EXT2_SB(sb)->s_bgi[group].debts < 255)
+ EXT2_SB(sb)->s_bgi[group].debts++;
} else {
- if (EXT2_SB(sb)->s_debts[group])
- EXT2_SB(sb)->s_debts[group]--;
+ if (EXT2_SB(sb)->s_bgi[group].debts)
+ EXT2_SB(sb)->s_bgi[group].debts--;
}

mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
diff -uNr linux/fs/ext2/super.c edited/fs/ext2/super.c
--- linux/fs/ext2/super.c Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/super.c Fri Mar 14 01:46:35 2003
@@ -141,7 +141,7 @@
if (sbi->s_group_desc[i])
brelse (sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
- kfree(sbi->s_debts);
+ kfree(sbi->s_bgi);
brelse (sbi->s_sbh);
sb->s_fs_info = NULL;
kfree(sbi);
@@ -464,8 +464,11 @@
int i;
int desc_block = 0;
struct ext2_sb_info *sbi = EXT2_SB(sb);
- unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+ struct ext2_super_block * es = sbi->s_es;
+ unsigned long block = le32_to_cpu(es->s_first_data_block);
struct ext2_group_desc * gdp = NULL;
+ unsigned int total_free = 0, free;
+ unsigned int reserved = le32_to_cpu(es->s_r_blocks_count);

ext2_debug ("Checking group descriptors");

@@ -504,6 +507,31 @@
block += EXT2_BLOCKS_PER_GROUP(sb);
gdp++;
}
+
+ /* restore free blocks counter in SB -bzzz */
+ total_free = ext2_count_free_blocks(sb);
+ if (le32_to_cpu(es->s_free_blocks_count) != total_free)
+ printk(KERN_INFO "EXT2-fs: last umount wasn't clean. "
+ "Correcting free blocks counter\n");
+ es->s_free_blocks_count = cpu_to_le32(total_free);
+
+ /* distribute reserved blocks over groups -bzzz */
+ for(i = sbi->s_groups_count-1; reserved && total_free && i >= 0; i--) {
+ gdp = ext2_get_group_desc (sb, i, NULL);
+ if (!gdp) {
+ ext2_error (sb, "ext2_check_descriptors",
+ "can't get descriptor for group %d", i);
+ return 0;
+ }
+
+ free = le16_to_cpu(gdp->bg_free_blocks_count);
+ if (free > reserved)
+ free = reserved;
+ sbi->s_bgi[i].reserved = free;
+ reserved -= free;
+ total_free -= free;
+ }
+
return 1;
}

@@ -768,13 +796,17 @@
printk ("EXT2-fs: not enough memory\n");
goto failed_mount;
}
- sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
+ sbi->s_bgi = kmalloc(sbi->s_groups_count*sizeof(struct ext2_bg_info),
GFP_KERNEL);
- if (!sbi->s_debts) {
+ if (!sbi->s_bgi) {
printk ("EXT2-fs: not enough memory\n");
goto failed_mount_group_desc;
}
- memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(*sbi->s_debts));
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ sbi->s_bgi[i].debts = 0;
+ sbi->s_bgi[i].reserved = 0;
+ spin_lock_init(&sbi->s_bgi[i].alloc_lock);
+ }
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logic_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
@@ -820,8 +852,8 @@
brelse(sbi->s_group_desc[i]);
failed_mount_group_desc:
kfree(sbi->s_group_desc);
- if (sbi->s_debts)
- kfree(sbi->s_debts);
+ if (sbi->s_bgi)
+ kfree(sbi->s_bgi);
failed_mount:
brelse(bh);
failed_sbi:
@@ -840,6 +872,7 @@

static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
{
+ es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
es->s_wtime = cpu_to_le32(get_seconds());
mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
@@ -868,6 +901,7 @@
ext2_debug ("setting valid to 0\n");
es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) &
~EXT2_VALID_FS);
+ es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
es->s_mtime = cpu_to_le32(get_seconds());
ext2_sync_super(sb, es);
} else
@@ -929,7 +963,8 @@
static int ext2_statfs (struct super_block * sb, struct statfs * buf)
{
struct ext2_sb_info *sbi = EXT2_SB(sb);
- unsigned long overhead;
+ unsigned long overhead, total_free = 0;
+ struct ext2_group_desc *desc;
int i;

if (test_opt (sb, MINIX_DF))
@@ -950,9 +985,14 @@
* block group descriptors. If the sparse superblocks
* feature is turned on, then not all groups have this.
*/
- for (i = 0; i < sbi->s_groups_count; i++)
+ for (i = 0; i < sbi->s_groups_count; i++) {
overhead += ext2_bg_has_super(sb, i) +
ext2_bg_num_gdb(sb, i);
+
+ /* sum total free blocks -bzzz */
+ desc = ext2_get_group_desc (sb, i, NULL);
+ total_free += le16_to_cpu(desc->bg_free_blocks_count);
+ }

/*
* Every block group has an inode bitmap, a block
@@ -965,7 +1005,7 @@
buf->f_type = EXT2_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = le32_to_cpu(sbi->s_es->s_blocks_count) - overhead;
- buf->f_bfree = ext2_count_free_blocks (sb);
+ buf->f_bfree = total_free;
buf->f_bavail = buf->f_bfree - le32_to_cpu(sbi->s_es->s_r_blocks_count);
if (buf->f_bfree < le32_to_cpu(sbi->s_es->s_r_blocks_count))
buf->f_bavail = 0;
diff -uNr linux/include/asm-alpha/bitops.h edited/include/asm-alpha/bitops.h
--- linux/include/asm-alpha/bitops.h Fri Mar 14 01:53:36 2003
+++ edited/include/asm-alpha/bitops.h Thu Mar 13 14:10:18 2003
@@ -487,7 +487,9 @@


#define ext2_set_bit __test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
#define ext2_clear_bit __test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
#define ext2_test_bit test_bit
#define ext2_find_first_zero_bit find_first_zero_bit
#define ext2_find_next_zero_bit find_next_zero_bit
diff -uNr linux/include/asm-arm/bitops.h edited/include/asm-arm/bitops.h
--- linux/include/asm-arm/bitops.h Fri Mar 14 01:53:36 2003
+++ edited/include/asm-arm/bitops.h Thu Mar 13 14:10:46 2003
@@ -357,8 +357,12 @@
*/
#define ext2_set_bit(nr,p) \
__test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
+#define ext2_set_bit_atomic(lock,nr,p) \
+ test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
#define ext2_clear_bit(nr,p) \
__test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
+#define ext2_clear_bit_atomic(lock,nr,p) \
+ test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
#define ext2_test_bit(nr,p) \
__test_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
#define ext2_find_first_zero_bit(p,sz) \
diff -uNr linux/include/asm-cris/bitops.h edited/include/asm-cris/bitops.h
--- linux/include/asm-cris/bitops.h Mon Nov 11 06:28:30 2002
+++ edited/include/asm-cris/bitops.h Thu Mar 13 14:11:15 2003
@@ -360,7 +360,9 @@
#define hweight8(x) generic_hweight8(x)

#define ext2_set_bit test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
#define ext2_clear_bit test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
#define ext2_test_bit test_bit
#define ext2_find_first_zero_bit find_first_zero_bit
#define ext2_find_next_zero_bit find_next_zero_bit
diff -uNr linux/include/asm-i386/bitops.h edited/include/asm-i386/bitops.h
--- linux/include/asm-i386/bitops.h Wed Dec 25 06:03:08 2002
+++ edited/include/asm-i386/bitops.h Thu Mar 13 14:11:32 2003
@@ -479,8 +479,12 @@

#define ext2_set_bit(nr,addr) \
__test_and_set_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock,nr,addr) \
+ test_and_set_bit((nr),(unsigned long*)addr)
#define ext2_clear_bit(nr, addr) \
__test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock,nr, addr) \
+ test_and_clear_bit((nr),(unsigned long*)addr)
#define ext2_test_bit(nr, addr) test_bit((nr),(unsigned long*)addr)
#define ext2_find_first_zero_bit(addr, size) \
find_first_zero_bit((unsigned long*)addr, size)
diff -uNr linux/include/asm-ia64/bitops.h edited/include/asm-ia64/bitops.h
--- linux/include/asm-ia64/bitops.h Thu Feb 20 16:18:21 2003
+++ edited/include/asm-ia64/bitops.h Thu Mar 13 14:12:50 2003
@@ -453,7 +453,9 @@
#define __clear_bit(nr, addr) clear_bit(nr, addr)

#define ext2_set_bit test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
#define ext2_clear_bit test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
#define ext2_test_bit test_bit
#define ext2_find_first_zero_bit find_first_zero_bit
#define ext2_find_next_zero_bit find_next_zero_bit
diff -uNr linux/include/asm-m68k/bitops.h edited/include/asm-m68k/bitops.h
--- linux/include/asm-m68k/bitops.h Mon Nov 11 06:28:33 2002
+++ edited/include/asm-m68k/bitops.h Thu Mar 13 14:15:31 2003
@@ -355,6 +355,16 @@
}

extern __inline__ int
+ext2_set_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, vaddr);
+ spin_unlock(lock);
+ return ret;
+}
+
+extern __inline__ int
ext2_clear_bit (int nr, volatile void *vaddr)
{
char retval;
@@ -366,6 +376,16 @@
}

extern __inline__ int
+ext2_clear_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, vaddr);
+ spin_unlock(lock);
+ return ret;
+}
+
+extern __inline__ int
ext2_test_bit (int nr, const volatile void *vaddr)
{
return ((1U << (nr & 7)) & (((const volatile unsigned char *) vaddr)[nr >> 3])) != 0;
diff -uNr linux/include/asm-m68knommu/bitops.h edited/include/asm-m68knommu/bitops.h
--- linux/include/asm-m68knommu/bitops.h Mon Nov 11 06:28:04 2002
+++ edited/include/asm-m68knommu/bitops.h Thu Mar 13 14:18:21 2003
@@ -387,6 +387,16 @@
return retval;
}

+extern __inline__ int ext2_set_bit_atomic(spinlock_t *lock, int nr,
+ volatile void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int ext2_clear_bit(int nr, volatile void * addr)
{
int mask, retval;
@@ -402,6 +412,16 @@
return retval;
}

+extern __inline__ int ext2_clear_bit_atomic(spinlock_t *lock, int nr,
+ volatile void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int ext2_test_bit(int nr, const volatile void * addr)
{
int mask;
diff -uNr linux/include/asm-mips/bitops.h edited/include/asm-mips/bitops.h
--- linux/include/asm-mips/bitops.h Mon Nov 11 06:28:03 2002
+++ edited/include/asm-mips/bitops.h Thu Mar 13 14:24:52 2003
@@ -810,6 +810,15 @@
return retval;
}

+extern __inline__ int ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int ext2_clear_bit(int nr, void * addr)
{
int mask, retval, flags;
@@ -824,6 +833,15 @@
return retval;
}

+extern __inline__ int ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int ext2_test_bit(int nr, const void * addr)
{
int mask;
@@ -890,7 +908,9 @@

/* Native ext2 byte ordering, just collapse using defines. */
#define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr), (addr))
#define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr), (addr))
#define ext2_test_bit(nr, addr) test_bit((nr), (addr))
#define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size))
#define ext2_find_next_zero_bit(addr, size, offset) \
diff -uNr linux/include/asm-mips64/bitops.h edited/include/asm-mips64/bitops.h
--- linux/include/asm-mips64/bitops.h Mon Nov 11 06:28:29 2002
+++ edited/include/asm-mips64/bitops.h Thu Mar 13 14:27:26 2003
@@ -517,6 +517,16 @@
}

extern inline int
+ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
+extern inline int
ext2_clear_bit(int nr, void * addr)
{
int mask, retval, flags;
@@ -532,6 +542,16 @@
}

extern inline int
+ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
+extern inline int
ext2_test_bit(int nr, const void * addr)
{
int mask;
@@ -599,7 +619,9 @@

/* Native ext2 byte ordering, just collapse using defines. */
#define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr), (addr))
#define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr), (addr))
#define ext2_test_bit(nr, addr) test_bit((nr), (addr))
#define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size))
#define ext2_find_next_zero_bit(addr, size, offset) \
diff -uNr linux/include/asm-parisc/bitops.h edited/include/asm-parisc/bitops.h
--- linux/include/asm-parisc/bitops.h Thu Feb 20 16:18:21 2003
+++ edited/include/asm-parisc/bitops.h Thu Mar 13 14:29:47 2003
@@ -389,10 +389,14 @@
*/
#ifdef __LP64__
#define ext2_set_bit(nr, addr) test_and_set_bit((nr) ^ 0x38, addr)
+#define ext2_set_bit_atomic(l,nr,addr) test_and_set_bit((nr) ^ 0x38, addr)
#define ext2_clear_bit(nr, addr) test_and_clear_bit((nr) ^ 0x38, addr)
+#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x38, addr)
#else
#define ext2_set_bit(nr, addr) test_and_set_bit((nr) ^ 0x18, addr)
+#define ext2_set_bit_atomic(l,nr,addr) test_and_set_bit((nr) ^ 0x18, addr)
#define ext2_clear_bit(nr, addr) test_and_clear_bit((nr) ^ 0x18, addr)
+#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x18, addr)
#endif

#endif /* __KERNEL__ */
diff -uNr linux/include/asm-ppc/bitops.h edited/include/asm-ppc/bitops.h
--- linux/include/asm-ppc/bitops.h Mon Jan 20 05:23:05 2003
+++ edited/include/asm-ppc/bitops.h Thu Mar 13 14:31:00 2003
@@ -392,7 +392,9 @@


#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))

static __inline__ int ext2_test_bit(int nr, __const__ void * addr)
{
diff -uNr linux/include/asm-ppc64/bitops.h edited/include/asm-ppc64/bitops.h
--- linux/include/asm-ppc64/bitops.h Mon Nov 11 06:28:28 2002
+++ edited/include/asm-ppc64/bitops.h Thu Mar 13 14:32:23 2003
@@ -336,8 +336,12 @@

#define ext2_set_bit(nr,addr) \
__test_and_set_le_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock, nr,addr) \
+ test_and_set_le_bit((nr),(unsigned long*)addr)
#define ext2_clear_bit(nr, addr) \
__test_and_clear_le_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+ test_and_clear_le_bit((nr),(unsigned long*)addr)
#define ext2_test_bit(nr, addr) test_le_bit((nr),(unsigned long*)addr)
#define ext2_find_first_zero_bit(addr, size) \
find_first_zero_le_bit((unsigned long*)addr, size)
diff -uNr linux/include/asm-s390/bitops.h edited/include/asm-s390/bitops.h
--- linux/include/asm-s390/bitops.h Fri Mar 14 01:53:27 2003
+++ edited/include/asm-s390/bitops.h Thu Mar 13 14:33:55 2003
@@ -805,8 +805,12 @@

#define ext2_set_bit(nr, addr) \
test_and_set_bit((nr)^24, (unsigned long *)addr)
+#define ext2_set_bit_atomic(lock, nr, addr) \
+ test_and_set_bit((nr)^24, (unsigned long *)addr)
#define ext2_clear_bit(nr, addr) \
test_and_clear_bit((nr)^24, (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+ test_and_clear_bit((nr)^24, (unsigned long *)addr)
#define ext2_test_bit(nr, addr) \
test_bit((nr)^24, (unsigned long *)addr)

diff -uNr linux/include/asm-s390x/bitops.h edited/include/asm-s390x/bitops.h
--- linux/include/asm-s390x/bitops.h Fri Mar 14 01:53:27 2003
+++ edited/include/asm-s390x/bitops.h Thu Mar 13 14:35:22 2003
@@ -838,8 +838,12 @@

#define ext2_set_bit(nr, addr) \
test_and_set_bit((nr)^56, (unsigned long *)addr)
+#define ext2_set_bit_atomic(lock, nr, addr) \
+ test_and_set_bit((nr)^56, (unsigned long *)addr)
#define ext2_clear_bit(nr, addr) \
test_and_clear_bit((nr)^56, (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+ test_and_clear_bit((nr)^56, (unsigned long *)addr)
#define ext2_test_bit(nr, addr) \
test_bit((nr)^56, (unsigned long *)addr)

diff -uNr linux/include/asm-sh/bitops.h edited/include/asm-sh/bitops.h
--- linux/include/asm-sh/bitops.h Mon Nov 11 06:28:02 2002
+++ edited/include/asm-sh/bitops.h Thu Mar 13 14:37:18 2003
@@ -265,6 +265,16 @@
return retval;
}

+static __inline__ int ext2_set_bit_atomic(spinlock_t *lock,
+ int nr, volatile void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
static __inline__ int ext2_clear_bit(int nr, volatile void * addr)
{
int mask, retval;
@@ -280,6 +290,16 @@
return retval;
}

+static __inline__ int ext2_clear_bit_atomic(spinlock_t *lock,
+ int nr, volatile void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
static __inline__ int ext2_test_bit(int nr, const volatile void * addr)
{
int mask;
diff -uNr linux/include/asm-sparc/bitops.h edited/include/asm-sparc/bitops.h
--- linux/include/asm-sparc/bitops.h Mon Jan 20 05:23:05 2003
+++ edited/include/asm-sparc/bitops.h Thu Mar 13 14:38:54 2003
@@ -454,7 +454,9 @@
find_next_zero_le_bit((addr), (size), 0)

#define ext2_set_bit __test_and_set_le_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_le_bit(n,a)
#define ext2_clear_bit __test_and_clear_le_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_le_bit(n,a)
#define ext2_test_bit test_le_bit
#define ext2_find_first_zero_bit find_first_zero_le_bit
#define ext2_find_next_zero_bit find_next_zero_le_bit
diff -uNr linux/include/asm-sparc64/bitops.h edited/include/asm-sparc64/bitops.h
--- linux/include/asm-sparc64/bitops.h Mon Nov 11 06:28:05 2002
+++ edited/include/asm-sparc64/bitops.h Thu Mar 13 14:43:49 2003
@@ -351,7 +351,9 @@
#ifdef __KERNEL__

#define ext2_set_bit(nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr))
+#define ext2_set_bit_atomic(lock,nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr))
#define ext2_clear_bit(nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock,nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr))
#define ext2_test_bit(nr,addr) test_le_bit((nr),(unsigned long *)(addr))
#define ext2_find_first_zero_bit(addr, size) \
find_first_zero_le_bit((unsigned long *)(addr), (size))
diff -uNr linux/include/asm-v850/bitops.h edited/include/asm-v850/bitops.h
--- linux/include/asm-v850/bitops.h Mon Nov 11 06:28:02 2002
+++ edited/include/asm-v850/bitops.h Thu Mar 13 14:44:48 2003
@@ -252,7 +252,9 @@
#define hweight8(x) generic_hweight8 (x)

#define ext2_set_bit test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
#define ext2_clear_bit test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
#define ext2_test_bit test_bit
#define ext2_find_first_zero_bit find_first_zero_bit
#define ext2_find_next_zero_bit find_next_zero_bit
diff -uNr linux/include/asm-x86_64/bitops.h edited/include/asm-x86_64/bitops.h
--- linux/include/asm-x86_64/bitops.h Fri Mar 14 01:53:27 2003
+++ edited/include/asm-x86_64/bitops.h Thu Mar 13 14:45:56 2003
@@ -487,8 +487,12 @@

#define ext2_set_bit(nr,addr) \
__test_and_set_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock,nr,addr) \
+ test_and_set_bit((nr),(unsigned long*)addr)
#define ext2_clear_bit(nr, addr) \
__test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock,nr,addr) \
+ test_and_clear_bit((nr),(unsigned long*)addr)
#define ext2_test_bit(nr, addr) test_bit((nr),(unsigned long*)addr)
#define ext2_find_first_zero_bit(addr, size) \
find_first_zero_bit((unsigned long*)addr, size)
diff -uNr linux/include/linux/ext2_fs_sb.h edited/include/linux/ext2_fs_sb.h
--- linux/include/linux/ext2_fs_sb.h Mon Nov 11 06:28:30 2002
+++ edited/include/linux/ext2_fs_sb.h Fri Mar 14 01:41:31 2003
@@ -16,6 +16,12 @@
#ifndef _LINUX_EXT2_FS_SB
#define _LINUX_EXT2_FS_SB

+struct ext2_bg_info {
+ u8 debts;
+ spinlock_t alloc_lock;
+ unsigned int reserved;
+} ____cacheline_aligned_in_smp;
+
/*
* second extended-fs super-block data in memory
*/
@@ -44,7 +50,7 @@
int s_first_ino;
u32 s_next_generation;
unsigned long s_dir_count;
- u8 *s_debts;
+ struct ext2_bg_info *s_bgi;
};

#endif /* _LINUX_EXT2_FS_SB */

2003-03-13 23:13:42

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

Andreas Dilger <[email protected]> wrote:
>
> > I think it's probably better to just lump all the root-reserved blocks
> > into as few blockgroups as possible.
>
> I might disagree here. One of the reasons for having the reserved blocks
> is to prevent fragmentation, and not necessarily to reserve space for root.
> For the lots of small files cases it makes more sense to leave free space
> in each group to prevent fragmentation at the group level.

Alex's approach effectively makes every blockgroup a little bit smaller. I
don't expect it will improve fragmentation effects. Not sure...

> ...
> We could also say that for the purpose of allocating new files in a directory,
> anything more than 95% full is "full" and the inode should be allocated in
> a different group regardless of where the parent is. It may be that the
> Orlov allocator already has such a heuristic, but I think that is a different
> discussion.

Yes, both find_group_other() and find_group_orlov() do things like that.

But only in 2.5, or in 2.4 with Ted's backport patches. find_group_other()
in 2.4 forgets to look at the free block count, which is rather sad.

2003-03-13 23:26:10

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

Alex Tomas <[email protected]> wrote:
>
>
> done
> done

Thanks!

>
> AM> c) It looks like EXT2FS_DEBUG broke. Nobody uses that much, but
> AM> we should fix and test it sometime.
>
> I suggest this be fixed in a separate patch. Do you agree?

Yes, that's fine.

> ...
> btw, what about minor bug in ext2 allocation code I posted recently?
> do you agree it needs to be fixed?

That's still in my inbox. I do not silently drop stuff, but am sometimes
laggy.


2003-03-13 23:46:17

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

First of all, thanks for this work, Alex. It's been a long time in coming.

One thing I would wonder about is whether we should be implementing this in
ext2, or in ext3 only. One of the decisions we made in the past is that we
shouldn't necessarily implement everything in ext2 (especially features that
complicated the code, and are only useful on high-end systems).

There was a desire to keep ext2 small and simple, and ext3 would get the
fancy high-end features that make sense if you have a large filesystem
that you would likely be using in conjunction with ext3 anyways.

It does make sense to test this out on ext2 since it is definitely easier
to code for ext2 than ext3, and the journaling doesn't skew the performance
so much. Of course one of the reasons that ext2 is easier to code for is
exactly _because_ we don't put all of the features into ext2...

Comments on the code inline below...

On Mar 13, 2003 22:17 +0300, Alex Tomas wrote:
> -static inline int reserve_blocks(struct super_block *sb, int count)
> +static inline int group_reserve_blocks(struct ext2_sb_info *sbi, struct ext2_bg_info *bgi,
> + struct ext2_group_desc *desc,
> + struct buffer_head *bh, int count, int use_reserve)

I would suggest just hooking the ext2_group_desc (and the buffer_head in
which it lives) off of the ext2_bg_info array instead of passing both
around explicitly. Since we have ext2_bg_info as a group_number-indexed
array already, this would essentially mean that wherever we call
ext2_get_group_desc() we could just use sbi->bgi[group].desc (or make
ext2_get_group_desc() do that, if we don't need it to populate bgi[group].desc
in the first place).

> + root_blocks = bgi->reserved;

I would avoid calling this "root_blocks" and instead just use "bgi->reserved"
or "reserved_blocks" everywhere. The original intent of these blocks was to
reduce fragmentation and not necessarily reserved-for-root.

> + if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
> + sbi->s_resuid != current->fsuid &&
> + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
> + /*
> + * We are too close to reserve and we are not privileged.
> + * Can we allocate anything at all?
> + */
> + if (free_blocks > root_blocks)
> + count = free_blocks - root_blocks;
> + else {
> + spin_unlock(&bgi->alloc_lock);
> + return 0;
> + }

Per my other email, if we want to handle large files properly by allowing them
to fill the entire group, yet we want to keep the "reserved blocks" count
correct, we could always grab the lock on the last group and add reserved
blocks there. Or, we could just ignore the reserved blocks count entirely.

> unsigned long ext2_count_free_blocks (struct super_block * sb)
> :
> - return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_blocks_count);
> + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
> + desc = ext2_get_group_desc (sb, i, NULL);
> + if (!desc)
> + continue;
> + desc_count += le16_to_cpu(desc->bg_free_blocks_count);
> + }
> + return desc_count;
> #endif

In general, this should be safe to do without a lock, since it is only
used for heuristics (orlov) and statfs (which is out-of-date as soon as
we call it). Are there any other users of ext2_count_free_blocks() that
need a correct value? I suppose mount/unmount to set s_free_blocks_count,
but those probably have exclusive access to the filesystem anyways.

PS - it looks like you are using spaces for indents instead of tabs here...

> + if (le32_to_cpu(es->s_free_blocks_count) != total_free)
> + printk(KERN_INFO "EXT2-fs: last umount wasn't clean. correct free blocks counter\n");

Probably no need to print this for ext2, since there is already an "uncleanly
unmounted" flag in the superblock, and e2fsck will have otherwise fixed it
up.

> + /* distribute reserved blocks over groups -bzzz */
> + while (reserved && total_free) {
> + unsigned int per_group = reserved / sbi->s_groups_count + 1;
> + unsigned int free;
> +
> + for (i = 0; reserved && i < sbi->s_groups_count; i++) {
> + gdp = ext2_get_group_desc (sb, i, NULL);
> + if (!gdp) {
> + ext2_error (sb, "ext2_check_descriptors",
> + "can't get descriptor for group #%d", i);
> + return 0;
> + }
> +
> + free = le16_to_cpu(gdp->bg_free_blocks_count);
> + if (per_group > free)
> + per_group = free;

I'm not sure whether I agree with this or not... If a group ever exceeds
the reserved mark for some reason (e.g. full filesystem) it will never be
able to "improve itself" back to a decent amount of reserved blocks. That
said, you may want to only reduce "reserved" by "free" in the end, so that
the total amount of reserved blocks is kept constant. (You may also
need to re-calculate "per_group" for each loop.)

> extern __inline__ int
> +ext2_set_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)

Please don't use "extern __inline__", as that can cause all sorts of
grief. Either "static inline" or just "extern".

> +struct ext2_bg_info {
> + u8 debts;
> + spinlock_t alloc_lock;
> + unsigned int reserved;
> +};

Please rename this "balloc_lock", as it is likely that we will get an
"ialloc_lock" in the future also.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2003-03-14 07:17:59

by Alex Tomas

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64


hi!

>>>>> Andreas Dilger (AD) writes:

AD> First of all, thanks for this work, Alex. It's been a long time
AD> in coming. One thing I would wonder about is whether we should
AD> be implementing this in ext2, or in ext3 only. One of the
AD> decisions we made in the past is that we shouldn't necessarily
AD> implement everything in ext2 (especially features that
AD> complicated the code, and are only useful on high-end systems).

well. ext2 in 2.4 have a lot of BKL. ext2 in 2.5 is almost free from
BKL. I think concurrent balloc is just one more step in this direction.

AD> I would suggest just hooking the ext2_group_desc (and the
AD> buffer_head in which it lives) off of the ext2_bg_info array
AD> instead of passing both around explicitly. Since we have
AD> ext2_bg_info as a group_number-indexed array already, this would
AD> essentially mean that wherever we call ext2_get_group_desc() we
AD> could just use sbi->bgi[group].desc (or make
AD> ext2_get_group_desc() do that, if we don't need it to populate
AD> bgi[group].desc in the first place).

that makes sense. What about doing it in a separate patch,
just to prevent a huge concurrent-balloc.diff?


>> + root_blocks = bgi->reserved;

AD> I would avoid calling this "root_blocks" and instead just use
AD> "bgi->reserved" or "reserved_blocks" everywhere. The original
AD> intent of these blocks was to reduce fragmentation and not
AD> necessarily reserved-for-root.

fixed

>> + if (free_blocks < root_blocks + count &&
>> !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid &&
>> + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + /* + *
>> We are too close to reserve and we are not privileged. + * Can we
>> allocate anything at all? + */ + if (free_blocks > root_blocks) +
>> count = free_blocks - root_blocks; + else { +
>> spin_unlock(&bgi->alloc_lock); + return 0; + }

AD> Per my other email, if we want to handle large files properly by
AD> allowing them to fill the entire group, yet we want to keep the
AD> "reserved blocks" count correct, we could always grab the lock on
AD> the last group and add reserved blocks there. Or, we could just
AD> ignore the reserved blocks count entirely.

hmm. looks I miss something here. reservation is protected by the lock.
what's the problem?

>> unsigned long ext2_count_free_blocks (struct super_block * sb)
>> :
>> - return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_blocks_count); +
>> for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc =
>> ext2_get_group_desc (sb, i, NULL); + if (!desc) + continue; +
>> desc_count += le16_to_cpu(desc->bg_free_blocks_count); + } +
>> return desc_count; #endif

this code doesn't use lock. if you mean code under EXT2FS_DEBUG then
Andrew already pointed out that this code is broken and should be fixed.

AD> In general, this should be safe to do without a lock, since it is
AD> only used for heuristics (orlov) and statfs (which is out-of-date
AD> as soon as we call it). Are there any other users of
AD> ext2_count_free_blocks() that need a correct value? I suppose
AD> mount/unmount to set s_free_blocks_count, but those probably have
AD> exclusive access to the filesystem anyways.

there is no more user of ext2_count_free_block() who needs precise counter.
find_group_orlov() uses it, but I think this routine doesn't need this:
the loop is serialized against block reservation.

AD> PS - it looks like you are using spaces for indents instead of
AD> tabs here...

I just use vim ;)

>> + if (le32_to_cpu(es->s_free_blocks_count) != total_free) +
>> printk(KERN_INFO "EXT2-fs: last umount wasn't clean. correct free
>> blocks counter\n");

AD> Probably no need to print this for ext2, since there is already
AD> an "uncleanly unmounted" flag in the superblock, and e2fsck will
AD> have otherwise fixed it up.

fixed. in fact. it was 'debug for myself'.

AD> I'm not sure whether I agree with this or not... If a group ever
AD> exceeds the reserved mark for some reason (e.g. full filesystem)
AD> it will never be able to "improve itself" back to a decent amount
AD> of reserved blocks. That said, you may want to only reduce
AD> "reserved" by "free" in the end, so that the total amount of
AD> reserved blocks is kept constant. need to re-calculate
AD> "per_group" for each loop).

well, I believe reserved blocks may be really _reserved_ at the end of
the fs, simply because nobody should use them until the fs is almost full.

>> extern __inline__ int +ext2_set_bit_atomic (spinlock_t *lock, int
>> nr, volatile void *vaddr)

AD> Please don't use "extern __inline__", as that can cause all sorts
AD> of grief. Either "static inline" or just "extern".

fixed

>> +struct ext2_bg_info { + u8 debts; + spinlock_t alloc_lock; +
>> unsigned int reserved; +};

AD> Please rename this "balloc_lock", as it is likely that we will
AD> get an "ialloc_lock" in the future also.

this makes sense as well.


and corrected patch:



diff -uNr linux/fs/ext2/balloc.c edited/fs/ext2/balloc.c
--- linux/fs/ext2/balloc.c Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/balloc.c Fri Mar 14 09:54:11 2003
@@ -94,69 +94,62 @@
return bh;
}

-static inline int reserve_blocks(struct super_block *sb, int count)
+static inline int group_reserve_blocks(struct ext2_sb_info *sbi, struct ext2_bg_info *bgi,
+ struct ext2_group_desc *desc,
+ struct buffer_head *bh, int count, int use_reserve)
{
- struct ext2_sb_info * sbi = EXT2_SB(sb);
- struct ext2_super_block * es = sbi->s_es;
- unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
- unsigned root_blocks = le32_to_cpu(es->s_r_blocks_count);
+ unsigned free_blocks;
+ unsigned root_blocks;

+ spin_lock(&bgi->balloc_lock);
+
+ free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
if (free_blocks < count)
count = free_blocks;
+ root_blocks = bgi->reserved;

- if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
- sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
- /*
- * We are too close to reserve and we are not privileged.
- * Can we allocate anything at all?
- */
- if (free_blocks > root_blocks)
- count = free_blocks - root_blocks;
- else
- return 0;
+ if (free_blocks < bgi->reserved && !use_reserve) {
+ /* don't use reserved blocks */
+ spin_unlock(&bgi->balloc_lock);
+ return 0;
}
-
- es->s_free_blocks_count = cpu_to_le32(free_blocks - count);
- mark_buffer_dirty(sbi->s_sbh);
- sb->s_dirt = 1;
- return count;
-}
-
-static inline void release_blocks(struct super_block *sb, int count)
-{
- if (count) {
- struct ext2_sb_info * sbi = EXT2_SB(sb);
- struct ext2_super_block * es = sbi->s_es;
- unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
- es->s_free_blocks_count = cpu_to_le32(free_blocks + count);
- mark_buffer_dirty(sbi->s_sbh);
- sb->s_dirt = 1;
+
+ if (free_blocks < bgi->reserved + count && !capable(CAP_SYS_RESOURCE) &&
+ sbi->s_resuid != current->fsuid &&
+ (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+ /*
+ * We are too close to reserve and we are not privileged.
+ * Can we allocate anything at all?
+ */
+ if (free_blocks > bgi->reserved)
+ count = free_blocks - bgi->reserved;
+ else {
+ spin_unlock(&bgi->balloc_lock);
+ return 0;
+ }
}
-}
-
-static inline int group_reserve_blocks(struct ext2_group_desc *desc,
- struct buffer_head *bh, int count)
-{
- unsigned free_blocks;
-
- if (!desc->bg_free_blocks_count)
- return 0;
-
- free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
- if (free_blocks < count)
- count = free_blocks;
desc->bg_free_blocks_count = cpu_to_le16(free_blocks - count);
+
+ spin_unlock(&bgi->balloc_lock);
+
mark_buffer_dirty(bh);
return count;
}

-static inline void group_release_blocks(struct ext2_group_desc *desc,
- struct buffer_head *bh, int count)
+static inline void group_release_blocks(struct ext2_bg_info *bgi,
+ struct ext2_group_desc *desc,
+ struct buffer_head *bh, int count)
{
if (count) {
- unsigned free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
+ unsigned free_blocks;
+
+ spin_lock(&bgi->balloc_lock);
+
+ free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count);
+
+ spin_unlock(&bgi->balloc_lock);
+
mark_buffer_dirty(bh);
}
}
@@ -172,12 +165,11 @@
unsigned long i;
unsigned long overflow;
struct super_block * sb = inode->i_sb;
+ struct ext2_sb_info * sbi = EXT2_SB(sb);
struct ext2_group_desc * desc;
- struct ext2_super_block * es;
+ struct ext2_super_block * es = sbi->s_es;
unsigned freed = 0, group_freed;

- lock_super (sb);
- es = EXT2_SB(sb)->s_es;
if (block < le32_to_cpu(es->s_first_data_block) ||
block + count < block ||
block + count > le32_to_cpu(es->s_blocks_count)) {
@@ -215,16 +207,17 @@
if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
in_range (block, le32_to_cpu(desc->bg_inode_table),
- EXT2_SB(sb)->s_itb_per_group) ||
+ sbi->s_itb_per_group) ||
in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
- EXT2_SB(sb)->s_itb_per_group))
+ sbi->s_itb_per_group))
ext2_error (sb, "ext2_free_blocks",
"Freeing blocks in system zones - "
"Block = %lu, count = %lu",
block, count);

for (i = 0, group_freed = 0; i < count; i++) {
- if (!ext2_clear_bit(bit + i, bitmap_bh->b_data))
+ if (!ext2_clear_bit_atomic(&sbi->s_bgi[block_group].balloc_lock,
+ bit + i, (void *) bitmap_bh->b_data))
ext2_error (sb, "ext2_free_blocks",
"bit already cleared for block %lu",
block + i);
@@ -236,7 +229,7 @@
if (sb->s_flags & MS_SYNCHRONOUS)
sync_dirty_buffer(bitmap_bh);

- group_release_blocks(desc, bh2, group_freed);
+ group_release_blocks(&sbi->s_bgi[block_group], desc, bh2, group_freed);
freed += group_freed;

if (overflow) {
@@ -246,18 +239,18 @@
}
error_return:
brelse(bitmap_bh);
- release_blocks(sb, freed);
- unlock_super (sb);
DQUOT_FREE_BLOCK(inode, freed);
}

-static int grab_block(char *map, unsigned size, int goal)
+static int grab_block(spinlock_t *lock, char *map, unsigned size, int goal)
{
int k;
char *p, *r;

if (!ext2_test_bit(goal, map))
goto got_it;
+
+repeat:
if (goal) {
/*
* The goal was occupied; search forward for a free
@@ -297,7 +290,8 @@
}
return -1;
got_it:
- ext2_set_bit(goal, map);
+ if (ext2_set_bit_atomic(lock, goal, (void *) map))
+ goto repeat;
return goal;
}

@@ -319,7 +313,7 @@
int ret_block; /* j */
int bit; /* k */
int target_block; /* tmp */
- int block = 0;
+ int block = 0, use_reserve = 0;
struct super_block *sb = inode->i_sb;
struct ext2_sb_info *sbi = EXT2_SB(sb);
struct ext2_super_block *es = sbi->s_es;
@@ -341,14 +335,7 @@
prealloc_goal--;

dq_alloc = prealloc_goal + 1;
-
- lock_super (sb);
-
- es_alloc = reserve_blocks(sb, dq_alloc);
- if (!es_alloc) {
- *err = -ENOSPC;
- goto out_unlock;
- }
+ es_alloc = dq_alloc;

ext2_debug ("goal=%lu.\n", goal);

@@ -360,7 +347,8 @@
if (!desc)
goto io_error;

- group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+ group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no],
+ desc, gdp_bh, es_alloc, 0);
if (group_alloc) {
ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
group_size);
@@ -371,11 +359,12 @@

ext2_debug("goal is at %d:%d.\n", group_no, ret_block);

- ret_block = grab_block(bitmap_bh->b_data,
+ ret_block = grab_block(&sbi->s_bgi[group_no].balloc_lock,
+ bitmap_bh->b_data,
group_size, ret_block);
if (ret_block >= 0)
goto got_block;
- group_release_blocks(desc, gdp_bh, group_alloc);
+ group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc);
group_alloc = 0;
}

@@ -385,6 +374,7 @@
* Now search the rest of the groups. We assume that
* i and desc correctly point to the last group visited.
*/
+repeat:
for (bit = 0; !group_alloc &&
bit < sbi->s_groups_count; bit++) {
group_no++;
@@ -393,7 +383,16 @@
desc = ext2_get_group_desc(sb, group_no, &gdp_bh);
if (!desc)
goto io_error;
- group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+ group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no],
+ desc, gdp_bh, es_alloc, use_reserve);
+ }
+ if (!use_reserve) {
+ /* first time we did not try to allocate
+ * reserved blocks. now it looks like
+ * no more non-reserved blocks left. we
+ * will try to allocate reserved blocks -bzzz */
+ use_reserve = 1;
+ goto repeat;
}
if (bit >= sbi->s_groups_count) {
*err = -ENOSPC;
@@ -404,13 +403,11 @@
if (!bitmap_bh)
goto io_error;

- ret_block = grab_block(bitmap_bh->b_data, group_size, 0);
+ ret_block = grab_block(&sbi->s_bgi[group_no].balloc_lock,
+ bitmap_bh->b_data, group_size, 0);
if (ret_block < 0) {
- ext2_error (sb, "ext2_new_block",
- "Free blocks count corrupted for block group %d",
- group_no);
group_alloc = 0;
- goto io_error;
+ goto repeat;
}

got_block:
@@ -452,7 +449,8 @@
unsigned n;

for (n = 0; n < group_alloc && ++ret_block < group_size; n++) {
- if (ext2_set_bit(ret_block, bitmap_bh->b_data))
+ if (ext2_set_bit_atomic(&sbi->s_bgi[group_no].balloc_lock,
+ ret_block, (void*) bitmap_bh->b_data))
break;
}
*prealloc_block = block + 1;
@@ -471,10 +469,7 @@

*err = 0;
out_release:
- group_release_blocks(desc, gdp_bh, group_alloc);
- release_blocks(sb, es_alloc);
-out_unlock:
- unlock_super (sb);
+ group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc);
DQUOT_FREE_BLOCK(inode, dq_alloc);
out:
brelse(bitmap_bh);
@@ -487,11 +482,11 @@

unsigned long ext2_count_free_blocks (struct super_block * sb)
{
-#ifdef EXT2FS_DEBUG
- struct ext2_super_block * es;
- unsigned long desc_count, bitmap_count, x;
struct ext2_group_desc * desc;
+ unsigned long desc_count = 0;
int i;
+#ifdef EXT2FS_DEBUG
+ unsigned long bitmap_count, x;

lock_super (sb);
es = EXT2_SB(sb)->s_es;
@@ -519,7 +514,13 @@
unlock_super (sb);
return bitmap_count;
#else
- return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_blocks_count);
+ for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+ desc = ext2_get_group_desc (sb, i, NULL);
+ if (!desc)
+ continue;
+ desc_count += le16_to_cpu(desc->bg_free_blocks_count);
+ }
+ return desc_count;
#endif
}

diff -uNr linux/fs/ext2/ialloc.c edited/fs/ext2/ialloc.c
--- linux/fs/ext2/ialloc.c Fri Mar 14 01:53:36 2003
+++ edited/fs/ext2/ialloc.c Thu Mar 13 20:08:58 2003
@@ -278,7 +278,8 @@
int ngroups = sbi->s_groups_count;
int inodes_per_group = EXT2_INODES_PER_GROUP(sb);
int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups;
- int avefreeb = le32_to_cpu(es->s_free_blocks_count) / ngroups;
+ int free_blocks = ext2_count_free_blocks(sb);
+ int avefreeb = free_blocks / ngroups;
int blocks_per_dir;
int ndirs = sbi->s_dir_count;
int max_debt, max_dirs, min_blocks, min_inodes;
@@ -320,8 +321,7 @@
goto fallback;
}

- blocks_per_dir = (le32_to_cpu(es->s_blocks_count) -
- le32_to_cpu(es->s_free_blocks_count)) / ndirs;
+ blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - free_blocks) / ndirs;

max_dirs = ndirs / ngroups + inodes_per_group / 16;
min_inodes = avefreei - inodes_per_group / 4;
@@ -340,7 +340,7 @@
desc = ext2_get_group_desc (sb, group, &bh);
if (!desc || !desc->bg_free_inodes_count)
continue;
- if (sbi->s_debts[group] >= max_debt)
+ if (sbi->s_bgi[group].debts >= max_debt)
continue;
if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
continue;
@@ -501,11 +501,11 @@
cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);

if (S_ISDIR(mode)) {
- if (EXT2_SB(sb)->s_debts[group] < 255)
- EXT2_SB(sb)->s_debts[group]++;
+ if (EXT2_SB(sb)->s_bgi[group].debts < 255)
+ EXT2_SB(sb)->s_bgi[group].debts++;
} else {
- if (EXT2_SB(sb)->s_debts[group])
- EXT2_SB(sb)->s_debts[group]--;
+ if (EXT2_SB(sb)->s_bgi[group].debts)
+ EXT2_SB(sb)->s_bgi[group].debts--;
}

mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
diff -uNr linux/fs/ext2/super.c edited/fs/ext2/super.c
--- linux/fs/ext2/super.c Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/super.c Fri Mar 14 10:10:09 2003
@@ -141,7 +141,7 @@
if (sbi->s_group_desc[i])
brelse (sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
- kfree(sbi->s_debts);
+ kfree(sbi->s_bgi);
brelse (sbi->s_sbh);
sb->s_fs_info = NULL;
kfree(sbi);
@@ -464,8 +464,11 @@
int i;
int desc_block = 0;
struct ext2_sb_info *sbi = EXT2_SB(sb);
- unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+ struct ext2_super_block * es = sbi->s_es;
+ unsigned long block = le32_to_cpu(es->s_first_data_block);
struct ext2_group_desc * gdp = NULL;
+ unsigned int total_free = 0, free;
+ unsigned int reserved = le32_to_cpu(es->s_r_blocks_count);

ext2_debug ("Checking group descriptors");

@@ -504,6 +507,27 @@
block += EXT2_BLOCKS_PER_GROUP(sb);
gdp++;
}
+
+ /* restore free blocks counter in SB -bzzz */
+ es->s_free_blocks_count = total_free = ext2_count_free_blocks(sb);
+
+ /* distribute reserved blocks over groups -bzzz */
+ for(i = sbi->s_groups_count-1; reserved && total_free && i >= 0; i--) {
+ gdp = ext2_get_group_desc (sb, i, NULL);
+ if (!gdp) {
+ ext2_error (sb, "ext2_check_descriptors",
+ "cant get descriptor for group %d", i);
+ return 0;
+ }
+
+ free = le16_to_cpu(gdp->bg_free_blocks_count);
+ if (free > reserved)
+ free = reserved;
+ sbi->s_bgi[i].reserved = free;
+ reserved -= free;
+ total_free -= free;
+ }
+
return 1;
}

@@ -768,13 +792,17 @@
printk ("EXT2-fs: not enough memory\n");
goto failed_mount;
}
- sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
+ sbi->s_bgi = kmalloc(sbi->s_groups_count*sizeof(struct ext2_bg_info),
GFP_KERNEL);
- if (!sbi->s_debts) {
+ if (!sbi->s_bgi) {
printk ("EXT2-fs: not enough memory\n");
goto failed_mount_group_desc;
}
- memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(*sbi->s_debts));
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ sbi->s_bgi[i].debts = 0;
+ sbi->s_bgi[i].reserved = 0;
+ spin_lock_init(&sbi->s_bgi[i].balloc_lock);
+ }
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logic_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
@@ -820,8 +848,8 @@
brelse(sbi->s_group_desc[i]);
failed_mount_group_desc:
kfree(sbi->s_group_desc);
- if (sbi->s_debts)
- kfree(sbi->s_debts);
+ if (sbi->s_bgi)
+ kfree(sbi->s_bgi);
failed_mount:
brelse(bh);
failed_sbi:
@@ -840,6 +868,7 @@

static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
{
+ es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
es->s_wtime = cpu_to_le32(get_seconds());
mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
@@ -868,6 +897,7 @@
ext2_debug ("setting valid to 0\n");
es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) &
~EXT2_VALID_FS);
+ es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
es->s_mtime = cpu_to_le32(get_seconds());
ext2_sync_super(sb, es);
} else
@@ -929,7 +959,8 @@
static int ext2_statfs (struct super_block * sb, struct statfs * buf)
{
struct ext2_sb_info *sbi = EXT2_SB(sb);
- unsigned long overhead;
+ unsigned long overhead, total_free = 0;
+ struct ext2_group_desc *desc;
int i;

if (test_opt (sb, MINIX_DF))
@@ -950,9 +981,14 @@
* block group descriptors. If the sparse superblocks
* feature is turned on, then not all groups have this.
*/
- for (i = 0; i < sbi->s_groups_count; i++)
+ for (i = 0; i < sbi->s_groups_count; i++) {
overhead += ext2_bg_has_super(sb, i) +
ext2_bg_num_gdb(sb, i);
+
+ /* sum total free blocks -bzzz */
+ desc = ext2_get_group_desc (sb, i, NULL);
+ total_free += le16_to_cpu(desc->bg_free_blocks_count);
+ }

/*
* Every block group has an inode bitmap, a block
@@ -965,7 +1001,7 @@
buf->f_type = EXT2_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = le32_to_cpu(sbi->s_es->s_blocks_count) - overhead;
- buf->f_bfree = ext2_count_free_blocks (sb);
+ buf->f_bfree = total_free;
buf->f_bavail = buf->f_bfree - le32_to_cpu(sbi->s_es->s_r_blocks_count);
if (buf->f_bfree < le32_to_cpu(sbi->s_es->s_r_blocks_count))
buf->f_bavail = 0;
diff -uNr linux/include/asm-alpha/bitops.h edited/include/asm-alpha/bitops.h
--- linux/include/asm-alpha/bitops.h Fri Mar 14 01:53:36 2003
+++ edited/include/asm-alpha/bitops.h Thu Mar 13 14:10:18 2003
@@ -487,7 +487,9 @@


#define ext2_set_bit __test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
#define ext2_clear_bit __test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
#define ext2_test_bit test_bit
#define ext2_find_first_zero_bit find_first_zero_bit
#define ext2_find_next_zero_bit find_next_zero_bit
diff -uNr linux/include/asm-arm/bitops.h edited/include/asm-arm/bitops.h
--- linux/include/asm-arm/bitops.h Fri Mar 14 01:53:36 2003
+++ edited/include/asm-arm/bitops.h Thu Mar 13 14:10:46 2003
@@ -357,8 +357,12 @@
*/
#define ext2_set_bit(nr,p) \
__test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
+#define ext2_set_bit_atomic(lock,nr,p) \
+ test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
#define ext2_clear_bit(nr,p) \
__test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
+#define ext2_clear_bit_atomic(lock,nr,p) \
+ test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
#define ext2_test_bit(nr,p) \
__test_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
#define ext2_find_first_zero_bit(p,sz) \
diff -uNr linux/include/asm-cris/bitops.h edited/include/asm-cris/bitops.h
--- linux/include/asm-cris/bitops.h Mon Nov 11 06:28:30 2002
+++ edited/include/asm-cris/bitops.h Thu Mar 13 14:11:15 2003
@@ -360,7 +360,9 @@
#define hweight8(x) generic_hweight8(x)

#define ext2_set_bit test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
#define ext2_clear_bit test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
#define ext2_test_bit test_bit
#define ext2_find_first_zero_bit find_first_zero_bit
#define ext2_find_next_zero_bit find_next_zero_bit
diff -uNr linux/include/asm-i386/bitops.h edited/include/asm-i386/bitops.h
--- linux/include/asm-i386/bitops.h Wed Dec 25 06:03:08 2002
+++ edited/include/asm-i386/bitops.h Thu Mar 13 14:11:32 2003
@@ -479,8 +479,12 @@

#define ext2_set_bit(nr,addr) \
__test_and_set_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock,nr,addr) \
+ test_and_set_bit((nr),(unsigned long*)addr)
#define ext2_clear_bit(nr, addr) \
__test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock,nr, addr) \
+ test_and_clear_bit((nr),(unsigned long*)addr)
#define ext2_test_bit(nr, addr) test_bit((nr),(unsigned long*)addr)
#define ext2_find_first_zero_bit(addr, size) \
find_first_zero_bit((unsigned long*)addr, size)
diff -uNr linux/include/asm-ia64/bitops.h edited/include/asm-ia64/bitops.h
--- linux/include/asm-ia64/bitops.h Thu Feb 20 16:18:21 2003
+++ edited/include/asm-ia64/bitops.h Thu Mar 13 14:12:50 2003
@@ -453,7 +453,9 @@
#define __clear_bit(nr, addr) clear_bit(nr, addr)

#define ext2_set_bit test_and_set_bit
+#define ext2_set_atomic(l,n,a) test_and_set_bit(n,a)
#define ext2_clear_bit test_and_clear_bit
+#define ext2_clear_atomic(l,n,a) test_and_clear_bit(n,a)
#define ext2_test_bit test_bit
#define ext2_find_first_zero_bit find_first_zero_bit
#define ext2_find_next_zero_bit find_next_zero_bit
diff -uNr linux/include/asm-m68k/bitops.h edited/include/asm-m68k/bitops.h
--- linux/include/asm-m68k/bitops.h Mon Nov 11 06:28:33 2002
+++ edited/include/asm-m68k/bitops.h Fri Mar 14 10:00:15 2003
@@ -354,6 +354,16 @@
return retval;
}

+static inline int
+ext2_set_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, vaddr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int
ext2_clear_bit (int nr, volatile void *vaddr)
{
@@ -365,6 +375,16 @@
return retval;
}

+static inline int
+ext2_clear_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, vaddr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int
ext2_test_bit (int nr, const volatile void *vaddr)
{
diff -uNr linux/include/asm-m68knommu/bitops.h edited/include/asm-m68knommu/bitops.h
--- linux/include/asm-m68knommu/bitops.h Mon Nov 11 06:28:04 2002
+++ edited/include/asm-m68knommu/bitops.h Fri Mar 14 10:00:52 2003
@@ -387,6 +387,16 @@
return retval;
}

+static inline int ext2_set_bit_atomic(spinlock_t *lock, int nr,
+ volatile void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int ext2_clear_bit(int nr, volatile void * addr)
{
int mask, retval;
@@ -402,6 +412,16 @@
return retval;
}

+static inline int ext2_clear_bit_atomic(spinlock_t *lock, int nr,
+ volatile void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int ext2_test_bit(int nr, const volatile void * addr)
{
int mask;
diff -uNr linux/include/asm-mips/bitops.h edited/include/asm-mips/bitops.h
--- linux/include/asm-mips/bitops.h Mon Nov 11 06:28:03 2002
+++ edited/include/asm-mips/bitops.h Fri Mar 14 10:01:22 2003
@@ -810,6 +810,15 @@
return retval;
}

+static inline int ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int ext2_clear_bit(int nr, void * addr)
{
int mask, retval, flags;
@@ -824,6 +833,15 @@
return retval;
}

+static inline int ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern __inline__ int ext2_test_bit(int nr, const void * addr)
{
int mask;
@@ -890,7 +908,9 @@

/* Native ext2 byte ordering, just collapse using defines. */
#define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr), (addr))
#define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr), (addr))
#define ext2_test_bit(nr, addr) test_bit((nr), (addr))
#define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size))
#define ext2_find_next_zero_bit(addr, size, offset) \
diff -uNr linux/include/asm-mips64/bitops.h edited/include/asm-mips64/bitops.h
--- linux/include/asm-mips64/bitops.h Mon Nov 11 06:28:29 2002
+++ edited/include/asm-mips64/bitops.h Fri Mar 14 10:01:46 2003
@@ -516,6 +516,16 @@
return retval;
}

+static inline int
+ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern inline int
ext2_clear_bit(int nr, void * addr)
{
@@ -531,6 +541,16 @@
return retval;
}

+static inline int
+ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
extern inline int
ext2_test_bit(int nr, const void * addr)
{
@@ -599,7 +619,9 @@

/* Native ext2 byte ordering, just collapse using defines. */
#define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr), (addr))
#define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr), (addr))
#define ext2_test_bit(nr, addr) test_bit((nr), (addr))
#define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size))
#define ext2_find_next_zero_bit(addr, size, offset) \
diff -uNr linux/include/asm-parisc/bitops.h edited/include/asm-parisc/bitops.h
--- linux/include/asm-parisc/bitops.h Thu Feb 20 16:18:21 2003
+++ edited/include/asm-parisc/bitops.h Thu Mar 13 14:29:47 2003
@@ -389,10 +389,14 @@
*/
#ifdef __LP64__
#define ext2_set_bit(nr, addr) test_and_set_bit((nr) ^ 0x38, addr)
+#define ext2_set_bit_atomic(l,nr,addr) test_and_set_bit((nr) ^ 0x38, addr)
#define ext2_clear_bit(nr, addr) test_and_clear_bit((nr) ^ 0x38, addr)
+#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x38, addr)
#else
#define ext2_set_bit(nr, addr) test_and_set_bit((nr) ^ 0x18, addr)
+#define ext2_set_bit_atomic(l,nr,addr) test_and_set_bit((nr) ^ 0x18, addr)
#define ext2_clear_bit(nr, addr) test_and_clear_bit((nr) ^ 0x18, addr)
+#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x18, addr)
#endif

#endif /* __KERNEL__ */
diff -uNr linux/include/asm-ppc/bitops.h edited/include/asm-ppc/bitops.h
--- linux/include/asm-ppc/bitops.h Mon Jan 20 05:23:05 2003
+++ edited/include/asm-ppc/bitops.h Thu Mar 13 14:31:00 2003
@@ -392,7 +392,9 @@


#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))

static __inline__ int ext2_test_bit(int nr, __const__ void * addr)
{
diff -uNr linux/include/asm-ppc64/bitops.h edited/include/asm-ppc64/bitops.h
--- linux/include/asm-ppc64/bitops.h Mon Nov 11 06:28:28 2002
+++ edited/include/asm-ppc64/bitops.h Thu Mar 13 14:32:23 2003
@@ -336,8 +336,12 @@

#define ext2_set_bit(nr,addr) \
__test_and_set_le_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock, nr,addr) \
+ test_and_set_le_bit((nr),(unsigned long*)addr)
#define ext2_clear_bit(nr, addr) \
__test_and_clear_le_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+ test_and_clear_le_bit((nr),(unsigned long*)addr)
#define ext2_test_bit(nr, addr) test_le_bit((nr),(unsigned long*)addr)
#define ext2_find_first_zero_bit(addr, size) \
find_first_zero_le_bit((unsigned long*)addr, size)
diff -uNr linux/include/asm-s390/bitops.h edited/include/asm-s390/bitops.h
--- linux/include/asm-s390/bitops.h Fri Mar 14 01:53:27 2003
+++ edited/include/asm-s390/bitops.h Thu Mar 13 14:33:55 2003
@@ -805,8 +805,12 @@

#define ext2_set_bit(nr, addr) \
test_and_set_bit((nr)^24, (unsigned long *)addr)
+#define ext2_set_bit_atomic(lock, nr, addr) \
+ test_and_set_bit((nr)^24, (unsigned long *)addr)
#define ext2_clear_bit(nr, addr) \
test_and_clear_bit((nr)^24, (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+ test_and_clear_bit((nr)^24, (unsigned long *)addr)
#define ext2_test_bit(nr, addr) \
test_bit((nr)^24, (unsigned long *)addr)

diff -uNr linux/include/asm-s390x/bitops.h edited/include/asm-s390x/bitops.h
--- linux/include/asm-s390x/bitops.h Fri Mar 14 01:53:27 2003
+++ edited/include/asm-s390x/bitops.h Thu Mar 13 14:35:22 2003
@@ -838,8 +838,12 @@

#define ext2_set_bit(nr, addr) \
test_and_set_bit((nr)^56, (unsigned long *)addr)
+#define ext2_set_bit_atomic(lock, nr, addr) \
+ test_and_set_bit((nr)^56, (unsigned long *)addr)
#define ext2_clear_bit(nr, addr) \
test_and_clear_bit((nr)^56, (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+ test_and_clear_bit((nr)^56, (unsigned long *)addr)
#define ext2_test_bit(nr, addr) \
test_bit((nr)^56, (unsigned long *)addr)

diff -uNr linux/include/asm-sh/bitops.h edited/include/asm-sh/bitops.h
--- linux/include/asm-sh/bitops.h Mon Nov 11 06:28:02 2002
+++ edited/include/asm-sh/bitops.h Fri Mar 14 10:03:08 2003
@@ -265,6 +265,16 @@
return retval;
}

+static inline int ext2_set_bit_atomic(spinlock_t *lock,
+ int nr, volatile void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_set_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
static __inline__ int ext2_clear_bit(int nr, volatile void * addr)
{
int mask, retval;
@@ -280,6 +290,16 @@
return retval;
}

+static inline int ext2_clear_bit_atomic(spinlock_t *lock,
+ int nr, volatile void * addr)
+{
+ int ret;
+ spin_lock(lock);
+ ret = ext2_clear_bit(nr, addr);
+ spin_unlock(lock);
+ return ret;
+}
+
static __inline__ int ext2_test_bit(int nr, const volatile void * addr)
{
int mask;
diff -uNr linux/include/asm-sparc/bitops.h edited/include/asm-sparc/bitops.h
--- linux/include/asm-sparc/bitops.h Mon Jan 20 05:23:05 2003
+++ edited/include/asm-sparc/bitops.h Thu Mar 13 14:38:54 2003
@@ -454,7 +454,9 @@
find_next_zero_le_bit((addr), (size), 0)

#define ext2_set_bit __test_and_set_le_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_le_bit(n,a)
#define ext2_clear_bit __test_and_clear_le_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_le_bit(n,a)
#define ext2_test_bit test_le_bit
#define ext2_find_first_zero_bit find_first_zero_le_bit
#define ext2_find_next_zero_bit find_next_zero_le_bit
diff -uNr linux/include/asm-sparc64/bitops.h edited/include/asm-sparc64/bitops.h
--- linux/include/asm-sparc64/bitops.h Mon Nov 11 06:28:05 2002
+++ edited/include/asm-sparc64/bitops.h Thu Mar 13 14:43:49 2003
@@ -351,7 +351,9 @@
#ifdef __KERNEL__

#define ext2_set_bit(nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr))
+#define ext2_set_bit_atomic(lock,nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr))
#define ext2_clear_bit(nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock,nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr))
#define ext2_test_bit(nr,addr) test_le_bit((nr),(unsigned long *)(addr))
#define ext2_find_first_zero_bit(addr, size) \
find_first_zero_le_bit((unsigned long *)(addr), (size))
diff -uNr linux/include/asm-v850/bitops.h edited/include/asm-v850/bitops.h
--- linux/include/asm-v850/bitops.h Mon Nov 11 06:28:02 2002
+++ edited/include/asm-v850/bitops.h Thu Mar 13 14:44:48 2003
@@ -252,7 +252,9 @@
#define hweight8(x) generic_hweight8 (x)

#define ext2_set_bit test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a) test_and_set_bit(n,a)
#define ext2_clear_bit test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
#define ext2_test_bit test_bit
#define ext2_find_first_zero_bit find_first_zero_bit
#define ext2_find_next_zero_bit find_next_zero_bit
diff -uNr linux/include/asm-x86_64/bitops.h edited/include/asm-x86_64/bitops.h
--- linux/include/asm-x86_64/bitops.h Fri Mar 14 01:53:27 2003
+++ edited/include/asm-x86_64/bitops.h Thu Mar 13 14:45:56 2003
@@ -487,8 +487,12 @@

#define ext2_set_bit(nr,addr) \
__test_and_set_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock,nr,addr) \
+ test_and_set_bit((nr),(unsigned long*)addr)
#define ext2_clear_bit(nr, addr) \
__test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock,nr,addr) \
+ test_and_clear_bit((nr),(unsigned long*)addr)
#define ext2_test_bit(nr, addr) test_bit((nr),(unsigned long*)addr)
#define ext2_find_first_zero_bit(addr, size) \
find_first_zero_bit((unsigned long*)addr, size)
diff -uNr linux/include/linux/ext2_fs_sb.h edited/include/linux/ext2_fs_sb.h
--- linux/include/linux/ext2_fs_sb.h Mon Nov 11 06:28:30 2002
+++ edited/include/linux/ext2_fs_sb.h Fri Mar 14 09:51:10 2003
@@ -16,6 +16,12 @@
#ifndef _LINUX_EXT2_FS_SB
#define _LINUX_EXT2_FS_SB

+struct ext2_bg_info {
+ u8 debts;
+ spinlock_t balloc_lock;
+ unsigned int reserved;
+} ____cacheline_aligned_in_smp;
+
/*
* second extended-fs super-block data in memory
*/
@@ -44,7 +50,7 @@
int s_first_ino;
u32 s_next_generation;
unsigned long s_dir_count;
- u8 *s_debts;
+ struct ext2_bg_info *s_bgi;
};

#endif /* _LINUX_EXT2_FS_SB */


2003-03-14 18:24:53

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

> First of all, thanks for this work, Alex. It's been a long time in coming.
>
> One thing I would wonder about is whether we should be implementing this in
> ext2, or in ext3 only. One of the decisions we made in the past is that we
> shouldn't necessarily implement everything in ext2 (especially features that
> complicated the code, and are only useful on high-end systems).
>
> There was a desire to keep ext2 small and simple, and ext3 would get the
> fancy high-end features that make sense if you have a large filesystem
> that you would likely be using in conjunction with ext3 anyways.

Errrm ... if you want to start advocating that sort of thing, I suggest
you make ext3 usable on high end systems first. At the moment, that makes
no sense whatsoever. Ext3 still doesn't scale to big systems.

M.


2003-03-14 19:15:44

by Daniel Phillips

[permalink] [raw]
Subject: Re: [Ext2-devel] Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

On Fri 14 Mar 03 00:56, Andreas Dilger wrote:
> There was a desire to keep ext2 small and simple, and ext3 would get the
> fancy high-end features that make sense if you have a large filesystem
> that you would likely be using in conjunction with ext3 anyways.
>
> It does make sense to test this out on ext2 since it is definitely easier
> to code for ext2 than ext3, and the journaling doesn't skew the performance
> so much. Of course one of the reasons that ext2 is easier to code for is
> exactly _because_ we don't put all of the features into ext2...
>
> Comments on the code inline below...

Ext3 is getting to the point, or has already gotten to the point, where it's
so reliable that it's reasonable to call it Linux's new native filesystem. At
this point, Ext2 can become more of a crucible for new techniques, hopefully,
techniques that simplify things, shorten up data paths, clarify the code,
make it more parallel and so on. For example, I can't help thinking that's
there's some fundamental improvement possible to the truncate path (hmm, I
wonder if I'm giving Alex new ideas...) and that proving such a thing out in
Ext2 first would make a whole lot of sense.

I do intend to pick up the Ext2 HTree patch again in due course and attempt
some simplification of it, as well as working on the outstanding
optimizations, i.e., improved inode allocation and delete coalescing. HTree
is an example of a feature that adds a few K of code, but in my opinion it's
worth it in order to match up better with the Ext3 feature set. Besides,
Ext2 is still quite attractive as a host filesystem for NFS export, and would
be still more attractive with the directory index.

(By the way, on the HTree simplification front, there's a whole lot of
forward declaration cruft that can go away as soon as CONFIG_EXT3_INDEX
is declared to be always on.)

So anyway, the point you were making and that I agree with, is that Ext2 is
growing into the role of experimental filesystem; Ext3 is now the stable
filesystem. Hopefully, the experiments will make Ext2 smaller, cleaner and
at the same time, more powerful, over time. Sort of like the role that RAMFS
plays: besides being useful, Ext2 should be thought of as a showcase for best
filesystem coding practices.

Regards,

Daniel

2003-03-14 19:45:04

by Andrew Morton

[permalink] [raw]
Subject: Re: [Ext2-devel] Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

Daniel Phillips <[email protected]> wrote:
>
> Ext2 should be thought of as a showcase for best
> filesystem coding practices.

Yes. It is the reference block-backed filesystem for the VFS and VM API. If
a feature is added to core kernel, ext2 gets to use it first, and ext2 is the
place to look to see "how is it done".

Arguably, minixfs should be playing that role, and it is close. But it is
now missing a few things.

ext2 is also scarily quick.

2003-03-14 20:48:57

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

On Mar 14, 2003 10:20 +0300, Alex Tomas wrote:
> AD> I would suggest just hooking the ext2_group_desc (and the
> AD> buffer_head in which it lives) off of the ext2_bg_info array
> AD> instead of passing both around explicitly. Since we have
> AD> ext2_bg_info as a group_number-indexed array already, this would
> AD> essentially mean that wherever we call ext2_get_group_desc() we
> AD> could just use sbi->bgi[group].desc (or make
> AD> ext2_get_group_desc() do that, if we don't need it to populate
> AD> bgi[group].desc in the first place).
>
> it makes sense. what about making it a separate patch?
> just to prevent a huge concurrent-balloc.diff

Could you make it a pre-requisite to the concurrent-alloc patch? That
would make it a shoo-in to being accepted (cleans up code nicely).

> >> + if (free_blocks < root_blocks + count &&
> >> !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid &&
> >> + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + /* + *
> >> We are too close to reserve and we are not privileged. + * Can we
> >> allocate anything at all? + */ + if (free_blocks > root_blocks) +
> >> count = free_blocks - root_blocks; + else { +
> >> spin_unlock(&bgi->alloc_lock); + return 0; + }

Argh, please try not to wrap code...

> AD> Per my other email, if we want to handle large files properly by
> AD> allowing them to fill the entire group, yet we want to keep the
> AD> "reserved blocks" count correct, we could always grab the lock on
> AD> the last group and add reserved blocks there. Or, we could just
> AD> ignore the reserved blocks count entirely.
>
> hmm. looks like I'm missing something here. reservation is protected by the lock.
> what's the problem?

So, what Andrew had complained about with the per-group reservation is
that it leaves "gaps" in the allocation of large files. Small gaps,
which IMHO aren't so critical, but whatever. So to avoid having gaps
in the allocation of large files you could additionally allow allocations
from the "reserved pool" of the group in the cases like:

(inode->i_blocks >> (inode->i_blkbits - 9)) > sbi->s_blocks_per_group / 2

If we want to preserve the total reserved blocks count (in the case where
the above test is the only reason we can allocate these blocks), we can
shift any reserved blocks we are "stealing" from this group into the last
group.

> AD> I'm not sure whether I agree with this or not... If a group ever
> AD> exceeds the reserved mark for some reason (e.g. full filesystem)
> AD> it will never be able to "improve itself" back to a decent amount
> AD> of reserved blocks. That said, you may want to only reduce
> AD> "reserved" by "free" in the end, so that the total amount of
> AD> reserved blocks is kept constant. need to re-calculate
> AD> "per_group" for each loop).
>
> well, I believe reserved blocks may be really _reserved_ at the end of
> the fs, simply because nobody should use them until the fs is almost full.

The point of having the reserved blocks is to reduce fragmentation
in file allocation. Having per-group reserved blocks is a good
idea, because it keeps the reserved "slack" per group, and helps file
allocations within that group have a bit of free space in which to grow.
If you are reserving all of the blocks at the end of the filesystem,
then the earlier groups will become 100% allocated prematurely and lose
any ability to keep files there contiguous.

What I was disagreeing with was reducing a group's reserved count because
it currently exceeds the per_group reserved count. That's like saying
"the filesystem is 99% full, reduce the total reserved count to 1%".
Even if a group _currently_ exceeds the reserved limit, we should keep
the reserved limit for that group as-is, and hopefully allow it to grow
more "slack" for future allocation improvement if files are deleted.

If we are concerned about the total reserved blocks count (which I
personally am not), we can always add the shortfall in reserved blocks
for the current group to the remaining groups without reducing the
current group's reserved limit.

> {
> - struct ext2_sb_info * sbi = EXT2_SB(sb);
> - struct ext2_super_block * es = sbi->s_es;
> - unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
> - unsigned root_blocks = le32_to_cpu(es->s_r_blocks_count);
> + unsigned free_blocks;
> + unsigned root_blocks;
>
> + spin_lock(&bgi->balloc_lock);
> +
> + free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
> if (free_blocks < count)
> count = free_blocks;
> + root_blocks = bgi->reserved;

> >> + root_blocks = bgi->reserved;
>
> AD> I would avoid calling this "root_blocks" and instead just use
> AD> "bgi->reserved" or "reserved_blocks" everywhere. The original
> AD> intent of these blocks was to reduce fragmentation and not
> AD> necessarily reserved-for-root.
>
> fixed

??

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/

2003-03-14 21:11:51

by Alex Tomas

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

>>>>> Andreas Dilger (AD) writes:

AD> Could you make it a pre-requisite to the concurrent-alloc patch?
AD> That would make it a shoo-in to being accepted (cleans up code
AD> nicely).

Andrew already asked to wait until next -mm

AD> The point of having the reserved blocks is to reduce
AD> fragmentation in file allocation. Having per-group reserved
AD> blocks is a good idea, because it keeps the reserved "slack" per
AD> group, and helps file allocations within that group have a bit of
AD> free space in which to grow. If you are reserving all of the
AD> blocks at the end of the filesystem, then the earlier groups will
AD> become 100% allocated prematurely and lose any ability to keep
AD> files there contiguous.

well. looks like I'm missing something here. I thought reservation is not
an allocation policy, but a mechanism to protect some user (root, usually)
from fs overflow

2003-03-15 04:27:20

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

On Fri, Mar 14, 2003 at 10:20:24AM +0300, Alex Tomas wrote:
> and corrected patch:

This patch is a godsend. Whoever's listening, please apply!

dbench on 32x/48G NUMA-Q, aic7xxx adapter, pbay disk, 32K PAGE_SIZE
(pgcl was used for benchmark feasibility purposes)

throughput:
----------
before:
Throughput 61.5376 MB/sec 512 procs
dbench 512 637.21s user 15739.41s system 565% cpu 48:16.28 total

after:
Throughput 104.074 MB/sec 512 procs
(GRR, didn't do time, took ca. 30 minutes)

profile:
--------
before:
vma samples %-age symbol name
c0106ff4 160824916 45.1855 default_idle
c01dbfd0 49993575 14.0462 __copy_to_user_ll
c01dc038 15474349 4.34768 __copy_from_user_ll
c0108140 13603867 3.82215 .text.lock.semaphore
c0119058 10872716 3.0548 try_to_wake_up
c02647f0 7896052 2.21848 sync_buffer
c011a1bc 7539112 2.11819 schedule
c0119dac 7168574 2.01409 scheduler_tick
c011fadc 6053745 1.70086 profile_hook
c0119860 4759523 1.33724 load_balance
c0107d0c 4472105 1.25649 __down
c011c4ff 4159010 1.16852 .text.lock.sched
c013dd28 3026705 0.850385 .text.lock.vmscan
c013ece4 3016788 0.847599 check_highmem_ptes
c0113590 2406329 0.676084 mark_offset_tsc
c02649c0 2210485 0.621059 add_event_entry
c010f6b8 2195748 0.616919 timer_interrupt
c0133118 1696204 0.476566 find_get_page

after:
vma samples %-age symbol name
c0106ff4 52751908 30.8696 default_idle
c01dc3b0 28988721 16.9637 __copy_to_user_ll
c01dc418 8240854 4.82242 __copy_from_user_ll
c011e472 8044716 4.70764 .text.lock.fork
c0264bd0 5666004 3.31566 sync_buffer
c013dd28 4454362 2.60662 .text.lock.vmscan
c0119058 4291999 2.51161 try_to_wake_up
c0119dac 4055412 2.37316 scheduler_tick
c011fadc 3554019 2.07976 profile_hook
c011a1bc 2866025 1.67715 schedule
c0119860 2637644 1.54351 load_balance
c0108140 2433644 1.42413 .text.lock.semaphore
c0264da0 1406704 0.823181 add_event_entry
c011c9a4 1370708 0.802117 prepare_to_wait
c0185e20 1236390 0.723516 ext2_new_block
c011c4ff 1227452 0.718285 .text.lock.sched
c013ece4 1148317 0.671977 check_highmem_ptes
c0113590 1145881 0.670551 mark_offset_tsc


vmstat (short excerpt, edited for readability):
------
before:
procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
12 5 38747168 484672 9049088 20 4032 1171 13148 1 22 65 12
11 11 38767264 479168 9034304 20 2908 1180 13077 1 28 52 19
9 14 38764256 480000 9036512 24 1920 1164 13940 1 23 51 25
7 7 38764128 480832 9035360 12 4444 1191 13784 1 24 51 24
9 5 38764512 481664 9033024 16 2924 1220 13853 1 23 66 10
9 6 38762208 482816 9035904 0 3404 1186 13686 1 25 62 12

after:
procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
60 11 38659840 533920 9226720 100 1672 2760 1853 5 66 11 18
31 23 38565472 531264 9320384 240 1020 1195 1679 2 35 37 26
23 23 38384928 521952 9503104 772 3372 5624 5093 2 62 9 27
24 31 37945664 518080 9916448 1536 5808 10449 13484 1 45 13 41
31 86 37755072 516096 10091104 1040 1916 3672 9744 2 51 15 32
24 30 37644352 512864 10192960 900 1612 3184 8414 2 49 12 36

There's a lot of odd things going on in both of the vmstat logs.


I've also collected logs of top slab consumers every 10s and full
dbench output for both runs, if that's interesting to anyone.


-- wli

2003-03-15 04:44:46

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

William Lee Irwin III <[email protected]> wrote:
>
> On Fri, Mar 14, 2003 at 10:20:24AM +0300, Alex Tomas wrote:
> > and corrected patch:
>
> This patch is a godsend. Whoever's listening, please apply!
>
> dbench on 32x/48G NUMA-Q, aic7xxx adapter, pbay disk, 32K PAGE_SIZE
> (pgcl was used for benchmark feasibility purposes)
>
> throughput:
> ----------
> before:
> Throughput 61.5376 MB/sec 512 procs
> dbench 512 637.21s user 15739.41s system 565% cpu 48:16.28 total
>
> after:
> Throughput 104.074 MB/sec 512 procs
> (GRR, didn't do time, took ca. 30 minutes)

`dbench 512' will presumably do lots of IO and spend significant
time in I/O wait. You should see the effects of this change more
if you use fewer clients (say, 32) so it doesn't hit disk.

On quad power4, dbench 32:

Unpatched:

Throughput 334.372 MB/sec (NB=417.965 MB/sec 3343.72 MBit/sec)
Throughput 331.379 MB/sec (NB=414.224 MB/sec 3313.79 MBit/sec)
Throughput 364.151 MB/sec (NB=455.189 MB/sec 3641.51 MBit/sec)
Throughput 333.066 MB/sec (NB=416.332 MB/sec 3330.66 MBit/sec)
Throughput 365.335 MB/sec (NB=456.669 MB/sec 3653.35 MBit/sec)
Throughput 335.523 MB/sec (NB=419.404 MB/sec 3355.23 MBit/sec)
Throughput 334.457 MB/sec (NB=418.071 MB/sec 3344.57 MBit/sec)
Throughput 329.527 MB/sec (NB=411.909 MB/sec 3295.27 MBit/sec)
Throughput 332.721 MB/sec (NB=415.901 MB/sec 3327.21 MBit/sec)
Throughput 328.735 MB/sec (NB=410.919 MB/sec 3287.35 MBit/sec)

patched:

Throughput 335.262 MB/sec (NB=419.078 MB/sec 3352.62 MBit/sec)
Throughput 334.531 MB/sec (NB=418.163 MB/sec 3345.31 MBit/sec)
Throughput 337.366 MB/sec (NB=421.707 MB/sec 3373.66 MBit/sec)
Throughput 334.504 MB/sec (NB=418.13 MB/sec 3345.04 MBit/sec)
Throughput 332.482 MB/sec (NB=415.602 MB/sec 3324.82 MBit/sec)
Throughput 334.69 MB/sec (NB=418.363 MB/sec 3346.9 MBit/sec)
Throughput 370.14 MB/sec (NB=462.675 MB/sec 3701.4 MBit/sec)
Throughput 333.255 MB/sec (NB=416.569 MB/sec 3332.55 MBit/sec)
Throughput 336.065 MB/sec (NB=420.081 MB/sec 3360.65 MBit/sec)
Throughput 334.328 MB/sec (NB=417.91 MB/sec 3343.28 MBit/sec)

No difference at all.

On the quad Xeon (after increasing dirty_ratio and dirty_background_ratio so
I/O was negligible) I was able to measure a 1.5% improvement.

I worry about the hardware you're using there.

> profile:
> --------
>
> ...
> after:
> vma samples %-age symbol name
> c0106ff4 52751908 30.8696 default_idle
> c01dc3b0 28988721 16.9637 __copy_to_user_ll
> c01dc418 8240854 4.82242 __copy_from_user_ll
> c011e472 8044716 4.70764 .text.lock.fork
> c0264bd0 5666004 3.31566 sync_buffer
> c013dd28 4454362 2.60662 .text.lock.vmscan
> c0119058 4291999 2.51161 try_to_wake_up
> c0119dac 4055412 2.37316 scheduler_tick
> c011fadc 3554019 2.07976 profile_hook
> c011a1bc 2866025 1.67715 schedule
> c0119860 2637644 1.54351 load_balance
> c0108140 2433644 1.42413 .text.lock.semaphore
> c0264da0 1406704 0.823181 add_event_entry
> c011c9a4 1370708 0.802117 prepare_to_wait
> c0185e20 1236390 0.723516 ext2_new_block
> c011c4ff 1227452 0.718285 .text.lock.sched
> c013ece4 1148317 0.671977 check_highmem_ptes
> c0113590 1145881 0.670551 mark_offset_tsc

Lots of idle time. Try it with a smaller client count, get the I/O out of
the picture.

>
> vmstat (short excerpt, edited for readability):

With what interval?

> after:
> procs -----------memory---------- -----io---- --system-- ----cpu----
> r b free buff cache bi bo in cs us sy id wa
> 60 11 38659840 533920 9226720 100 1672 2760 1853 5 66 11 18
> 31 23 38565472 531264 9320384 240 1020 1195 1679 2 35 37 26
> 23 23 38384928 521952 9503104 772 3372 5624 5093 2 62 9 27
> 24 31 37945664 518080 9916448 1536 5808 10449 13484 1 45 13 41
> 31 86 37755072 516096 10091104 1040 1916 3672 9744 2 51 15 32
> 24 30 37644352 512864 10192960 900 1612 3184 8414 2 49 12 36
>
> There's a lot of odd things going on in both of the vmstat logs.

Where are all those interrupts coming from?


2003-03-15 05:20:27

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

William Lee Irwin III <[email protected]> wrote:
>> dbench on 32x/48G NUMA-Q, aic7xxx adapter, pbay disk, 32K PAGE_SIZE
>> (pgcl was used for benchmark feasibility purposes)
>> throughput:
>> ----------
>> before:
>> Throughput 61.5376 MB/sec 512 procs
>> dbench 512 637.21s user 15739.41s system 565% cpu 48:16.28 total
>> after:
>> Throughput 104.074 MB/sec 512 procs
>> (GRR, didn't do time, took ca. 30 minutes)

On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> `dbench 512' will presumably do lots of IO and spend significant
> time in I/O wait. You should see the effects of this change more
> if you use fewer clients (say, 32) so it doesn't hit disk.
> On quad power4, dbench 32:

Hmm. I'm just trying to spawn enough tasks to keep the cpus busy to get
a large enough thread pool to have something to run when someone sleeps.
There's enough idle time now that this sounds like the wrong direction
to move the task count in...


On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> Unpatched:
> Throughput 334.372 MB/sec (NB=417.965 MB/sec 3343.72 MBit/sec)
[...]
> patched:
> Throughput 335.262 MB/sec (NB=419.078 MB/sec 3352.62 MBit/sec)
[...]
> No difference at all.
> On the quad Xeon (after increasing dirty_ratio and dirty_background_ratio so
> I/O was negligible) I was able to measure a 1.5% improvement.
> I worry about the hardware you're using there.

Why? The adapter is "vaguely modern" (actually acquired as part of a
hunt for an HBA w/a less buggy driver) but the box and disks and so on
are still pretty ancient, so the absolute numbers aren't useful.

To get a real comparison we'd have to compare spindles, HBA's, and
cpus, and attempt to factor them out. The disks are actually only
capable of doing 30MB/s or 40MB/s, the buses can only do 40MB/s, and
the cpus are 700MHz P-III's. Where dbench gets its numbers faster than
wirespeed I have no idea...

This locking issue may just need more cpus to bring out.


William Lee Irwin III <[email protected]> wrote:
>> profile:
>> --------
[...]
>> after:
>> vma samples %-age symbol name
>> c0106ff4 52751908 30.8696 default_idle
>> c01dc3b0 28988721 16.9637 __copy_to_user_ll
>> c01dc418 8240854 4.82242 __copy_from_user_ll
>> c011e472 8044716 4.70764 .text.lock.fork
[...]
> Lots of idle time. Try it with a smaller client count, get the I/O out of
> the picture.

I'll have trouble as there won't be enough tasks to keep the cpus busy.
Why do you think reducing the client count gets io out of the picture?
Why do you think reducing the client count will reduce idle time?


William Lee Irwin III <[email protected]> wrote:
>> vmstat (short excerpt, edited for readability):

On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> With what interval?

Sorry, 1s.


William Lee Irwin III <[email protected]> wrote:
>> after:
>> procs -----------memory---------- -----io---- --system-- ----cpu----
>> r b free buff cache bi bo in cs us sy id wa
>> 60 11 38659840 533920 9226720 100 1672 2760 1853 5 66 11 18
>> 31 23 38565472 531264 9320384 240 1020 1195 1679 2 35 37 26
>> 23 23 38384928 521952 9503104 772 3372 5624 5093 2 62 9 27
>> 24 31 37945664 518080 9916448 1536 5808 10449 13484 1 45 13 41
>> 31 86 37755072 516096 10091104 1040 1916 3672 9744 2 51 15 32
>> 24 30 37644352 512864 10192960 900 1612 3184 8414 2 49 12 36
>> There's a lot of odd things going on in both of the vmstat logs.

On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> Where are all those interrupts coming from?

Well, the timer interrupt is a killer. 1KHz*num_cpus_online() blows
goats for sufficiently large num_cpus_online(), but for some reason
things are slower without it. I suspect that scheduling response
time is somehow dependent on it.

I got a hold of an aic7xxx so io throughput is slightly better than my
usual NUMA-Q runs (i.e. oopsen). The disks are still clockwork, though.


-- wli

2003-03-15 05:33:08

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

>> On the quad Xeon (after increasing dirty_ratio and dirty_background_ratio so
>> I/O was negligible) I was able to measure a 1.5% improvement.
>> I worry about the hardware you're using there.
>
> Why? The adapter is "vaguely modern" (actually acquired as part of a
> hunt for an HBA w/a less buggy driver) but the box and disks and so on
> are still pretty ancient, so the absolute numbers aren't useful.
>
> To get a real comparison we'd have to compare spindles, HBA's, and
> cpus, and attempt to factor them out. The disks are actually only
> capable of doing 30MB/s or 40MB/s, the buses can only do 40MB/s, and
> the cpus are 700MHz P-III's. Where dbench gets its numbers faster than
> wirespeed I have no idea...

You'd also have to stop sending all your IO over a NUMA backplane ...

> This locking issue may just need more cpus to bring out.

More than 32 CPUs? Hmmmm.

M.

2003-03-15 05:38:44

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> `dbench 512' will presumably do lots of IO and spend significant
> time in I/O wait. You should see the effects of this change more
> if you use fewer clients (say, 32) so it doesn't hit disk.

Throughput 226.57 MB/sec 32 procs
dbench 32 2>& 1 25.04s user 515.02s system 1069% cpu 50.516 total

vma samples %-age symbol name
c0106ff4 1877599 35.8654 default_idle
c01dc3b0 586997 11.2127 __copy_to_user_ll
c0108140 193213 3.6907 .text.lock.semaphore
c015249a 137467 2.62586 .text.lock.file_table
c01dc418 117981 2.25364 __copy_from_user_ll
c01dc59c 115415 2.20463 .text.lock.dec_and_lock
c016997b 106198 2.02857 .text.lock.dcache
c0119dac 98439 1.88036 scheduler_tick
c01dc510 95745 1.8289 atomic_dec_and_lock
c0119058 91746 1.75251 try_to_wake_up
c011fadc 88996 1.69998 profile_hook
c0107d0c 84514 1.61436 __down
c01522a0 70518 1.34702 file_move
c011a1bc 68364 1.30587 schedule
c011c4ff 59716 1.14068 .text.lock.sched
c0168aac 58337 1.11434 d_lookup
c015f3dc 58111 1.11002 path_lookup
c0119860 55141 1.05329 load_balance

procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
11 0 47538048 549664 737120 0 0 1028 12123 2 33 65 0
6 2 47534592 550880 738272 0 16312 1085 12498 2 28 67 3
15 2 47559680 552064 711936 0 2332 1111 12197 2 30 63 6
10 3 47539648 547808 737344 0 5012 1174 12683 2 28 63 8
13 4 47585600 548736 689728 0 1616 1173 12393 2 31 58 8
17 2 47575680 550432 699264 0 2252 1224 12135 2 35 54 8
31 2 47643008 550944 631712 0 2216 1189 4795 2 82 15 2
28 1 47724288 551296 548320 0 2532 1178 4297 2 77 18 4
25 2 47798464 552032 473824 0 2724 1199 3283 2 73 22 3
12 5 48026944 552096 243296 0 2272 1170 4389 2 54 37 7
0 9 48201344 552160 69696 0 3480 1167 466 0 8 62 29
1 4 48206720 552160 64512 0 3252 1173 152 0 0 83 16
1 2 48210880 552160 60864 0 3232 1163 106 0 0 90 9
2 2 48210880 552160 60864 0 3592 1163 111 0 0 93 6
1 8 48256320 552160 36928 0 3008 1146 587 0 2 79 20
2 7 48264128 552160 30016 0 3488 1153 170 0 0 76 24
2 6 48268544 552160 26912 0 3012 1151 145 0 0 79 21
2 5 48273408 552160 22400 0 312 1162 116 0 0 83 16
4 0 48277248 552160 21056 12 8 1051 184 0 1 97 1
0 0 48280448 552160 21280 0 0 1033 59 0 0 100 0

2003-03-15 05:39:50

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

On Fri, Mar 14, 2003 at 09:43:38PM -0800, Martin J. Bligh wrote:
> You'd also have to stop sending all your IO over a NUMA backplane ...

Oh yes, there is also that.


At some point in the past, I wrote:
>> This locking issue may just need more cpus to bring out.

On Fri, Mar 14, 2003 at 09:43:38PM -0800, Martin J. Bligh wrote:
> More than 32 CPUs? Hmmmm.

More than 4.


-- wli

2003-03-15 06:10:05

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> > `dbench 512' will presumably do lots of IO and spend significant
> > time in I/O wait. You should see the effects of this change more
> > if you use fewer clients (say, 32) so it doesn't hit disk.
>
On Fri, Mar 14, 2003 at 09:49:10PM -0800, William Lee Irwin III wrote:
> Throughput 226.57 MB/sec 32 procs
> dbench 32 2>& 1 25.04s user 515.02s system 1069% cpu 50.516 total

It's too light a load... here's dbench 128.

Looks like dbench doesn't scale. It needs to learn how to spread itself
across disks if it's not to saturate a device queue while at the same
time generating enough cpu load to saturate cpus.

Is there a better (publishable/open/whatever) benchmark?

dbench 128:
Throughput 161.237 MB/sec 128 procs
dbench 128 2>& 1 143.85s user 3311.10s system 1219% cpu 4:43.27 total

vma samples %-age symbol name
c0106ff4 9134179 33.7261 default_idle
c01dc3b0 5570229 20.5669 __copy_to_user_ll
c01dc418 1773600 6.54865 __copy_from_user_ll
c0119058 731524 2.701 try_to_wake_up
c0108140 686952 2.53643 .text.lock.semaphore
c011a1bc 489415 1.80706 schedule
c0119dac 485196 1.79149 scheduler_tick
c011fadc 448048 1.65433 profile_hook
c0119860 356065 1.3147 load_balance
c0107d0c 267333 0.987072 __down
c011c4ff 249627 0.921696 .text.lock.sched
c0152ab0 223897 0.826694 __find_get_block_slow
c01dc510 222598 0.821897 atomic_dec_and_lock
c0168aac 218153 0.805485 d_lookup
c013ece4 194326 0.717509 check_highmem_ptes
c015f3dc 193112 0.713026 path_lookup
c01522a0 187115 0.690884 file_move
c010f6b8 166809 0.615908 timer_interrupt

procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
0 0 0 48280256 552768 23744 0 0 0 0 1027 16 0 0 100 0
1 0 0 48246272 552768 24160 0 0 0 108 1027 321 0 3 97 0
12 1 0 48194752 545152 24160 0 0 0 40 1054 664 0 8 91 0
14 0 0 48061696 548672 115328 0 0 0 0 1029 1922 2 33 64 0
42 0 0 47821824 547360 366240 0 0 0 0 1026 1255 1 75 22 2
63 1 0 47603392 546624 589760 0 0 0 0 1027 1270 1 98 1 0
60 0 0 47338368 551328 853056 0 0 0 8 1027 2193 1 96 2 0
61 0 0 47074496 551680 1117952 0 0 0 0 1034 2147 1 97 2 0
11 1 0 46781376 553184 1409472 0 0 0 0 1033 5128 1 80 18 1
35 0 0 46492224 552320 1696128 0 0 116 0 1059 7890 2 59 38 1
19 0 0 46295104 554304 1890112 0 0 28 0 1031 9004 2 52 45 1
14 0 0 46097728 558848 2086368 0 0 24 0 1033 8317 2 56 40 2
22 1 0 45849344 556288 2342304 0 0 20 32 1043 8267 2 55 43 1
26 1 0 45608576 558784 2579936 0 0 20 0 1032 7990 2 50 47 1
26 0 0 45421824 557184 2767040 0 0 16 0 1032 9670 2 41 55 2
21 1 0 45297408 557696 2889696 0 0 24 0 1034 8997 2 50 47 1
21 0 0 45254528 560992 2925760 0 0 4 0 1028 9363 2 47 50 0
35 0 0 45245120 556992 2938944 0 0 4 632 1097 7463 3 59 38 1
17 0 0 45247744 560768 2931456 0 0 0 0 1119 8538 3 52 45 0
15 1 0 45269376 556416 2913952 0 0 0 624 1056 9081 2 45 52 0
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
17 0 0 45296128 559136 2884576 0 0 4 12 1152 9749 2 44 54 0
24 0 0 45296832 562560 2880832 0 0 0 0 1029 8585 2 57 41 0
22 0 0 45274496 559072 2907360 0 0 0 588 1172 7348 2 60 38 0
19 1 0 45255616 555520 2929184 0 0 0 520 1027 7362 3 61 37 0
15 0 0 45245696 559552 2935264 0 0 0 52 1167 8931 3 50 47 0
10 0 0 45241152 563232 2936032 0 0 0 0 1027 9644 2 45 53 0
18 0 0 45273216 556800 2910176 0 0 0 416 1133 9657 2 49 48 0
16 0 0 45268288 559776 2912800 0 0 12 0 1030 9502 2 45 53 0
40 0 0 45253312 562368 2925664 0 0 0 0 1029 7597 3 62 35 0
11 1 0 45233408 562528 2945280 0 0 0 912 1118 8105 3 55 41 1
30 0 0 45251136 560192 2929888 0 0 0 104 1183 8715 3 50 47 0
11 0 0 45264768 562368 2913344 0 0 0 0 1025 8622 2 53 45 0
30 0 0 45296000 564128 2880928 0 0 0 160 1067 9565 2 48 50 0
20 0 0 45296192 559776 2886176 0 0 0 620 1173 8638 2 53 45 0
31 0 0 45267072 562400 2912416 0 0 0 0 1023 7383 3 63 34 0
17 1 0 45261184 558560 2921184 0 0 0 584 1043 8113 2 56 42 0
22 0 0 45239040 561984 2941216 0 0 0 56 1169 9078 2 48 49 0
17 1 0 45246528 564192 2928800 0 0 0 18076 1067 9885 2 46 50 1
24 2 0 45263808 560864 2915840 0 0 0 3912 1077 10085 2 47 48 3
25 3 0 45250944 563456 2927744 0 0 4 164 1063 8361 2 54 40 4
24 3 0 45247616 560448 2933728 0 0 0 600 1163 8387 3 58 35 5
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
23 2 0 45263744 560928 2917184 0 0 0 1792 1225 9299 2 52 42 4
21 1 0 45259264 562816 2920864 0 0 0 2052 1201 8903 3 49 45 3
24 1 0 45282432 564192 2895968 0 0 0 2220 1205 9228 2 49 47 2
17 2 0 45289408 560640 2892832 0 0 0 1276 1181 9552 2 46 49 3
31 1 0 45281344 558304 2903456 0 0 0 1332 1244 8814 2 53 41 3
23 1 0 45267456 560192 2914784 0 0 0 2164 1199 8500 2 55 41 1
20 1 0 45257536 562880 2922336 0 0 0 2212 1193 9535 2 50 46 2
23 1 0 45257152 565728 2919712 0 0 0 4524 1180 9578 2 45 51 2
11 2 0 45265408 563200 2914144 0 0 0 1484 1136 9154 2 50 45 3
22 1 0 45273344 560160 2910048 0 0 0 1540 1196 8949 2 49 46 3
21 1 0 45269632 561600 2910496 0 0 0 4840 1130 8013 3 58 38 2
30 1 0 45280960 563328 2897856 0 0 0 4292 1113 7722 3 62 34 2
14 1 0 45264064 565056 2913664 0 0 0 3492 1129 9123 2 54 42 2
26 2 0 45289792 562432 2890816 0 0 0 6028 1109 8671 2 57 39 2
14 2 0 45301504 561600 2880640 0 0 0 364 1117 9178 2 49 44 4
17 1 0 45286976 561472 2895744 0 0 0 6884 1175 8299 2 55 41 2
15 1 0 45270528 563040 2910240 0 0 0 4504 1090 8066 2 56 40 1
24 1 0 45261952 564704 2916800 0 0 0 2980 1102 8734 2 50 46 1
14 2 0 45261760 561920 2919968 0 0 0 5524 1122 9604 3 48 47 2
28 2 0 45269056 563136 2911488 0 0 0 328 1107 9034 3 51 42 4
14 1 0 45294080 560128 2889824 0 0 0 3756 1221 9055 2 50 45 2
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
28 1 0 45282944 561472 2899712 0 0 0 3556 1120 8221 3 57 39 2
23 1 0 45289856 562912 2890336 0 0 0 3176 1102 8270 3 56 40 1
27 1 0 45281984 564448 2896832 0 0 0 4364 1133 8721 3 53 42 2
22 1 0 45288576 565792 2889280 0 0 0 3756 1112 9156 3 50 45 2
20 2 0 45307968 563648 2872768 0 0 0 2564 1131 9414 2 48 47 2
20 2 0 45293376 563104 2886560 0 0 0 576 1166 8840 2 52 42 3
17 1 0 45287680 561824 2895616 0 0 0 3548 1202 8016 3 59 36 2
14 1 0 45272960 563584 2907584 0 0 0 3276 1139 8910 3 52 44 2
20 1 0 45265408 565088 2914784 0 0 0 3492 1135 9168 3 49 47 2
15 1 0 45278848 566560 2899296 0 0 0 3968 1170 9189 2 49 46 2
13 1 0 45291328 567360 2886592 0 0 0 3752 1125 9027 3 49 47 2
19 2 0 45297408 562944 2884864 0 0 0 2148 1126 8020 3 55 40 2
22 2 0 45298048 562944 2884544 0 0 0 492 1151 7937 3 56 38 3
25 1 0 45292352 562304 2891232 0 0 0 3948 1191 8045 3 54 42 2
22 1 0 45292480 563616 2888576 0 0 0 3620 1125 8714 2 52 43 2
17 1 0 45305344 564896 2874976 0 0 0 3084 1132 9214 3 49 47 2
13 1 0 45301760 565792 2877824 0 0 0 6672 1148 9488 2 46 50 2
20 1 0 45292160 566944 2884992 0 0 0 2368 1105 8931 2 53 44 1
22 2 0 45279552 564736 2899776 0 0 0 2820 1137 8201 3 59 36 2
11 2 0 45270976 564608 2910240 0 0 0 484 1150 9030 3 53 41 3
15 1 0 45297856 562432 2884544 0 0 0 4060 1176 8291 2 52 43 2
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
24 1 0 45302144 563200 2880032 0 0 0 3496 1139 9189 3 51 45 2
29 1 0 45310592 564224 2871296 0 0 0 4328 1145 8822 3 53 43 2
17 1 0 45323136 564896 2857600 0 0 0 4072 1131 7628 2 53 43 2
28 1 0 45312576 566240 2867328 0 0 0 3468 1150 8090 2 56 40 1
16 1 0 45299200 567360 2879328 0 0 0 2344 1156 8063 3 56 40 2
15 1 0 45315968 568640 2860768 0 0 0 2400 1181 9660 2 47 49 2
21 2 0 45321536 565248 2858944 0 0 0 1228 1197 9757 2 48 47 3
18 1 0 45308352 562656 2875168 0 0 0 1892 1230 8413 2 57 38 3
20 1 0 45300992 563584 2881216 0 0 0 4184 1174 8691 2 53 43 2
21 1 0 45289216 564640 2890720 0 0 0 3696 1138 8479 2 53 43 2
14 1 0 45287424 565472 2891872 0 0 0 4108 1146 8578 3 54 41 2
23 1 0 45309376 566240 2869152 0 0 0 2528 1124 9247 2 48 48 2
18 1 0 45314560 566848 2865312 0 0 0 3044 1126 9376 3 51 45 2
19 1 0 45300672 567968 2876000 0 0 0 2788 1127 9059 3 53 43 2
16 1 0 45320576 568736 2856672 0 0 0 3528 1139 8816 3 56 40 2
18 2 0 45314368 564992 2866784 0 0 0 1632 1138 8406 3 55 39 3
23 2 0 45298752 563584 2883552 0 0 0 792 1193 9421 3 47 47 4
17 1 0 45326976 563008 2855712 0 0 0 3756 1180 9161 2 51 44 2
15 1 0 45318976 563872 2864192 0 0 0 2600 1125 9449 2 50 46 2
27 1 0 45297472 564800 2884512 0 0 0 3176 1134 8358 3 57 38 2
22 1 0 45303552 565600 2877024 0 0 0 3204 1150 9311 3 49 47 2
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
22 0 0 45291776 566368 2888288 0 0 0 3104 1138 8800 3 51 45 2
21 1 0 45308416 567008 2871072 0 0 0 3112 1135 9005 3 51 44 2
18 1 0 45315392 567808 2863200 0 0 0 3080 1129 9499 2 50 46 2
27 1 0 45305920 568352 2871200 0 0 0 3012 1137 9396 3 49 47 2
25 1 0 45321344 569120 2854624 0 0 0 2728 1115 8623 2 55 41 1
24 2 0 45313984 565632 2867424 0 0 0 2432 1147 8838 2 53 43 2
20 1 0 45320704 562368 2864160 0 0 0 1372 1142 9033 3 50 45 3
22 1 0 45339968 563296 2843296 0 0 0 3520 1145 9056 2 49 46 2
17 1 0 45339584 563808 2843840 0 0 0 3468 1137 8871 2 54 43 1
17 1 0 45323968 564640 2859008 0 0 0 4096 1150 8350 3 57 39 2
26 1 0 45317824 565376 2864512 0 0 0 3040 1123 8397 3 56 40 1
14 1 0 45321344 566112 2859872 0 0 0 3328 1132 9070 2 52 44 2
21 1 0 45324672 566816 2855712 0 0 0 3336 1143 7997 3 59 37 1
18 1 0 45336768 567456 2844032 0 0 0 3888 1119 8689 2 55 41 2
22 1 0 45322432 567904 2858272 0 0 0 3816 1131 8632 3 51 44 2
25 1 0 45321280 568672 2857632 0 0 0 3012 1120 8792 2 56 40 2
19 1 0 45327040 569536 2851296 0 0 0 2180 1103 9192 2 51 45 1
18 1 0 45312576 570432 2863104 0 0 0 3308 1134 9535 3 50 46 2
15 2 0 45329088 566624 2850976 0 0 0 580 1127 9666 2 47 47 4
16 1 0 45335296 564064 2848288 0 0 0 3612 1210 9346 2 47 48 2
17 1 0 45331968 564640 2851008 0 0 0 3204 1129 9596 3 48 47 2
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
11 1 0 45330304 565376 2851520 0 0 0 3616 1123 9132 3 53 42 2
22 1 0 45331520 565984 2849888 0 0 0 3660 1137 9305 3 50 46 2
17 1 0 45332544 566592 2847648 0 0 0 3464 1123 8743 2 53 43 1
22 1 0 45332128 567104 2848992 0 0 0 2520 1112 8055 2 57 39 1
17 1 0 45322688 567584 2856480 0 0 0 3624 1124 9194 2 50 46 1
13 1 0 45326976 568224 2852128 0 0 0 2464 1127 9158 2 51 45 2
15 1 0 45320576 568736 2858368 0 0 0 2128 1155 9505 2 46 50 2
19 1 0 45318208 569312 2860160 0 0 0 2704 1174 9129 2 50 46 2
13 1 0 45325056 570016 2853216 0 0 0 2188 1142 9297 2 51 44 2
21 1 0 45337536 570400 2838048 0 0 0 3436 1177 8373 3 53 43 2
22 2 0 45334464 565696 2847584 0 0 0 1044 1128 8705 2 55 40 2
17 1 0 45358912 565024 2824736 0 0 0 2892 1219 8876 3 56 39 2
18 1 0 45357696 565664 2824192 0 0 0 3524 1154 8886 2 50 47 1
22 1 0 45355328 566176 2826816 0 0 0 3812 1152 9006 2 54 42 2
20 1 0 45354816 566720 2827040 0 0 0 3200 1148 9004 3 50 46 2
16 1 0 45349696 566976 2831360 0 0 0 3160 1150 8772 3 51 44 2
26 1 0 45360256 567424 2819648 0 0 0 3296 1145 8908 2 49 46 2
16 1 0 45352640 567776 2827968 0 0 0 4416 1151 9085 3 51 44 2
20 1 0 45337408 568480 2842624 0 0 0 4556 1165 8596 3 54 42 1
12 1 0 45337984 568928 2840800 0 0 0 3816 1158 9757 3 45 51 2
18 1 0 45348800 569472 2829696 0 0 0 4104 1166 9528 3 48 48 2
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
29 1 0 45358464 569760 2820192 0 0 0 4432 1152 8817 2 56 40 1
22 1 0 45346176 570240 2830336 0 0 0 2560 1122 8296 2 55 40 2
21 1 0 45351488 570464 2826144 0 0 0 3528 1145 9093 3 54 42 2
22 1 0 45349312 570752 2828160 0 0 0 3724 1149 8422 2 53 43 2
23 1 0 45347904 571040 2829856 0 0 0 3340 1138 9162 2 50 45 2
9 2 0 45354304 564960 2828288 0 0 0 1652 1150 9398 2 48 46 3
23 1 0 45358720 564928 2824608 0 0 0 2320 1183 9687 2 46 49 2
22 1 0 45356224 565536 2827232 0 0 0 3424 1155 8763 2 52 44 2
13 1 0 45345152 565984 2837472 0 0 0 2760 1132 8306 2 54 42 2
13 1 0 45344192 566400 2837792 0 0 0 4292 1168 9164 2 49 47 2
17 1 0 45351872 566816 2829088 0 0 0 3556 1143 9530 2 47 49 2
19 1 0 45352192 567104 2829248 0 0 0 3116 1139 9016 2 50 46 2
14 1 0 45360000 567680 2820096 0 0 0 4560 1161 9633 2 48 48 2
25 1 0 45352896 568160 2826720 0 0 0 3272 1147 8597 2 57 39 2
20 0 0 45347264 568448 2831968 0 0 0 3236 1134 8287 3 56 39 2
16 1 0 45341184 568928 2836224 0 0 0 3408 1138 9057 2 52 44 2
12 1 0 45347584 569280 2831168 0 0 0 2724 1122 9587 2 47 49 2
19 1 0 45354240 569440 2824832 0 0 0 3332 1134 9334 2 46 49 2
16 1 0 45357120 569760 2821568 0 0 0 2884 1137 8936 2 50 45 2
26 1 0 45346176 570176 2832544 0 0 0 3236 1130 9035 2 50 46 2
14 2 0 45339072 570560 2838272 0 0 8 3016 1136 8522 3 52 43 2
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
21 2 0 45357632 570848 2819616 0 0 0 3552 1154 9181 3 48 46 3
18 3 0 45363008 565632 2819168 0 0 8 2064 1146 9593 2 47 47 4
20 2 0 45404800 563776 2779744 0 0 8 2740 1197 9447 2 50 43 4
17 2 0 45404096 564032 2779968 0 0 0 4276 1156 8744 2 53 41 3
19 2 0 45394304 564320 2789728 0 0 0 2796 1127 8453 2 54 40 4
21 2 0 45385536 564864 2798336 0 0 4 2668 1160 8473 3 56 37 4
22 2 0 45397696 566048 2784160 0 0 24 3208 1204 9178 2 51 44 3
17 3 0 45403392 566912 2777632 0 0 8 2564 1156 9782 2 47 45 6
20 3 0 45401024 567488 2780672 0 0 16 3136 1157 8877 2 58 35 4
29 3 0 45408192 568512 2772256 0 0 8 2860 1141 9097 2 56 37 5
19 4 0 45387456 569504 2792192 0 0 4 4100 1149 8595 3 57 34 7
18 4 0 45390592 570336 2787872 0 0 20 3472 1143 8755 3 52 38 7
16 5 0 45408000 571456 2769280 0 0 16 3748 1167 10441 2 46 43 9
21 4 0 45423104 567584 2757696 0 0 48 1036 1193 10248 2 48 41 9
16 5 0 45427264 565344 2755648 0 0 32 2620 1236 9347 2 54 34 10
16 5 0 45417856 567456 2763296 0 0 72 3556 1160 9557 2 51 37 9
25 6 0 45427776 569440 2751616 0 0 60 3112 1151 10403 2 45 42 10
15 4 0 45438720 571072 2738720 0 0 48 3500 1173 9942 3 50 37 10
13 10 0 45457024 568320 2723392 0 0 96 1728 1160 10681 2 48 37 13
17 8 0 45454208 565952 2728416 0 0 24 1940 1224 10121 3 50 33 14
32 8 0 45460352 568416 2718848 0 0 56 2980 1148 10187 3 54 31 13
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
19 8 0 45449664 570048 2728992 0 0 48 3268 1160 9737 3 53 32 12
41 12 0 45494208 571392 2683040 0 0 88 2628 1179 10545 2 55 30 13
35 15 0 45536448 569280 2642528 0 0 148 1480 1136 9438 2 63 21 14
83 21 0 45558784 569728 2619680 0 0 160 848 1245 6599 2 79 9 9
20 21 0 45604736 566208 2576128 0 0 376 1216 1224 10129 2 65 17 16
30 29 0 45667328 570496 2512064 0 0 604 1668 1193 14999 2 54 19 25
24 31 0 45726656 572928 2449472 0 0 544 484 1209 16814 2 51 20 26
69 41 0 45830272 576576 2341088 0 0 604 320 1333 22979 2 58 15 25
42 62 0 45950688 566944 2228224 0 0 892 340 1464 27057 2 66 10 22
23 27 0 46028832 570656 2147328 0 0 424 304 1244 18954 2 78 5 15
28 107 0 46159200 566496 2022560 0 0 808 352 1228 23538 1 71 4 24
7 116 0 46224160 570016 1960864 0 0 600 224 1230 27775 0 13 5 82
2 119 0 46272416 574432 1909504 0 0 560 288 1239 21885 0 7 4 88
15 109 0 46330272 564256 1862912 0 0 488 968 1232 18866 0 7 3 90
5 114 0 46375584 568128 1816160 0 0 500 1216 1199 19986 0 7 1 91
2 114 0 46423456 560352 1777440 0 0 452 1248 1202 16782 0 6 2 92
2 113 0 46455392 563488 1742176 0 0 396 1768 1193 15011 0 5 3 93
6 111 0 46492320 566752 1702944 0 0 420 1464 1185 16749 0 5 0 94
4 112 0 46532512 570688 1659200 0 0 492 1152 1188 18447 0 6 1 92
4 113 0 46582304 562080 1617984 0 0 400 1344 1187 15024 0 5 2 93
2 113 0 46625312 565440 1571648 0 0 436 1568 1196 18994 0 7 1 92
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
6 109 0 46647712 567616 1548000 0 0 288 1956 1182 10422 0 4 3 93
4 108 0 46707872 559744 1499008 0 0 480 1200 1202 20386 0 8 3 89
9 103 0 46752672 563168 1452544 0 0 444 1360 1195 18791 0 6 3 90
6 105 0 46792864 566240 1409216 0 0 412 1720 1186 17387 0 6 3 90
2 75 0 46821152 568384 1381088 0 0 280 1156 1191 10829 0 4 4 92
2 66 0 46874784 559872 1336128 0 0 432 1240 1198 9432 0 4 4 92
3 53 0 46912480 563264 1294912 0 0 436 1360 1204 9480 0 4 3 93
1 27 0 46918112 563648 1289056 0 0 52 2000 1154 1342 0 1 21 78
1 26 0 46919776 563744 1287392 0 0 20 3380 1154 484 0 1 37 62
2 25 0 46921120 563936 1285664 0 0 28 2620 1133 422 0 1 37 62
4 24 0 46928992 564576 1277216 0 0 80 2508 1142 929 0 1 38 61
2 25 0 46935968 564832 1270304 0 0 40 2856 1147 912 0 1 37 62
4 97 0 46986784 568736 1221696 0 0 520 2720 1169 11680 0 6 7 87
2 97 0 47047712 560480 1171264 0 0 444 1456 1193 17112 0 7 6 88
2 92 0 47094304 563168 1125856 0 0 368 1444 1180 15707 0 6 3 91
2 64 0 47138720 566048 1079008 0 0 376 1320 1192 14764 0 5 4 90
2 53 0 47160352 567360 1055904 0 0 176 2248 1165 4756 0 2 13 85
1 52 0 47162912 567456 1053280 0 0 16 2768 1143 889 0 1 12 87
7 81 0 47206816 569792 1016704 0 0 304 2288 1158 8250 0 4 11 84
2 80 0 47276128 561120 958400 0 0 432 1488 1198 17237 0 6 6 88
8 76 0 47331552 564320 902144 0 0 440 1252 1198 15313 0 6 6 88
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
1 72 0 47406880 567840 829184 0 0 456 1676 1204 18356 0 7 6 87
1 70 0 47457568 570624 778240 0 0 384 1508 1179 12199 0 4 3 92
8 61 0 47532000 573312 706496 0 0 348 1704 1184 14745 0 5 9 85
0 60 0 47596000 564896 655776 0 0 384 1516 1185 9453 0 4 10 86
2 57 0 47636640 568160 614112 0 0 416 1512 1189 7620 0 3 8 89
1 54 0 47689184 570016 564096 0 0 256 1996 1168 8807 0 4 7 90
1 54 0 47689632 570016 563520 0 0 0 2656 1129 548 0 0 9 90
1 52 0 47699616 570528 554336 0 0 64 200 1149 2042 0 1 13 86
0 49 0 47758112 572960 497216 0 0 312 4744 1201 9579 0 4 13 83
1 48 0 47768736 573280 487296 0 0 40 2204 1139 1964 0 1 16 83
1 48 0 47772384 573440 483392 0 0 24 3040 1142 1150 0 1 16 83
0 48 0 47799840 565632 463776 0 0 108 2156 1140 3420 0 2 15 83
1 48 0 47803616 565888 459552 0 0 32 4440 1157 1138 0 1 12 87
3 48 0 47852064 567776 409376 0 0 244 3280 1172 7173 0 3 9 88
1 41 0 47889568 568640 377888 0 0 108 4608 1160 4626 0 2 13 84
1 41 0 47896224 568832 371520 0 0 24 3712 1125 1348 0 1 22 77
1 41 0 47905632 569088 361600 0 0 32 2156 1152 1743 0 1 22 77
1 38 0 47929696 569888 338560 0 0 100 2836 1143 3432 0 2 21 77
0 39 0 47937760 569984 330560 0 0 16 3228 1136 1328 0 1 22 77
2 36 0 47944608 570080 326784 0 0 16 3560 1129 684 0 1 24 75
2 36 0 47974048 570688 297024 0 0 76 2560 1119 3580 0 2 26 71
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
1 35 0 47991392 571136 280256 0 0 56 2908 1122 2047 0 1 27 72
3 35 0 47994272 571168 277568 0 0 4 3676 1112 614 0 1 28 71
0 35 0 48006944 571648 264480 0 0 60 2252 1124 1507 0 1 26 73
0 35 0 48041440 572224 229440 0 0 80 2676 1137 3767 0 2 21 77
2 35 0 48042528 572320 227968 0 0 52 2540 1189 703 0 2 24 73
0 34 0 48061088 572544 209664 0 0 28 4124 1123 1935 0 1 24 75
0 29 0 48089696 573152 185568 0 0 76 3720 1132 2361 0 1 25 74
1 28 0 48097824 573248 178816 0 0 12 3616 1130 945 0 0 31 69
3 25 0 48105056 573408 174688 0 0 20 3112 1130 628 0 0 35 64
1 25 0 48126688 573824 152736 0 0 60 2400 1136 1972 0 1 40 59
0 25 0 48132000 574016 147616 0 0 28 2704 1126 516 0 0 36 63
0 24 0 48143136 574112 136832 0 0 12 2724 1126 816 0 0 35 65
0 23 0 48160480 574496 120800 0 0 48 4436 1192 1205 0 1 35 65
0 24 0 48155808 574752 118304 0 0 928 4848 1205 442 0 0 38 61
0 23 0 48163744 574816 117216 0 0 44 3108 1135 285 3 0 38 60
0 21 0 48192864 565920 99200 0 0 40 2368 1114 625 0 0 39 61
1 21 0 48197600 566048 94304 0 0 20 3212 1126 416 0 0 41 59
1 14 0 48206176 566048 92192 0 0 0 3244 1130 312 0 0 58 42
2 11 0 48217504 566080 83648 0 0 4 2536 1125 532 0 0 62 38
1 11 0 48241568 566592 59296 0 0 64 3300 1143 963 0 1 68 31
0 11 0 48242016 566624 58880 0 0 4 2788 1126 173 0 0 69 31
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
0 11 0 48242016 566624 58880 0 0 0 3648 1142 156 0 0 69 31
1 10 0 48253856 566944 46720 0 0 40 3644 1173 423 0 0 69 31
0 11 0 48256992 567040 43232 0 0 12 2644 1165 219 0 0 69 31
2 11 0 48261472 567136 38720 0 0 12 3048 1165 226 0 0 69 31
1 6 0 48268192 567200 35776 0 0 8 3608 1169 138 0 0 73 27
1 0 0 48288032 567552 22976 0 0 44 0 1132 142 0 0 88 12
0 0 0 48288288 567552 22976 0 0 0 180 1061 23 0 0 100 0
1 0 0 48288288 567552 22976 0 0 0 0 1026 21 0 0 100 0
0 0 0 48288096 567552 22976 0 0 0 0 1027 43 0 0 100 0

2003-03-15 06:34:07

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

William Lee Irwin III <[email protected]> wrote:
>
> On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> > > `dbench 512' will presumably do lots of IO and spend significant
> > > time in I/O wait. You should see the effects of this change more
> > > if you use fewer clients (say, 32) so it doesn't hit disk.
> >
> On Fri, Mar 14, 2003 at 09:49:10PM -0800, William Lee Irwin III wrote:
> > Throughput 226.57 MB/sec 32 procs
> > dbench 32 2>& 1 25.04s user 515.02s system 1069% cpu 50.516 total
>
> It's too light a load... here's dbench 128.

OK.

> Looks like dbench doesn't scale. It needs to learn how to spread itself
> across disks if it's not to saturate a device queue while at the same
> time generating enough cpu load to saturate cpus.

Nope. What we're trying to measure here is pure in-memory lock contention,
locked bus traffic, context switches, etc, etc. To do that we need to get
the IO system out of the picture.

One way to do that is to increase /proc/sys/vm/dirty_ratio and
dirty_background_ratio to 70% or so. You can still hit IO wait if someone
tries to truncate a file which pdflush is writing out, so increase
dirty_expire_centisecs and dirty_writeback_centisecs to 1000000000 or so...

Then, on the second run, when all the required metadata blocks are in
pagecache you should be able to get an IO-free run.

> Is there a better (publicable/open/whatever) benchmark?

I have lots of little testlets which can be mixed and matched. RAM-only
dbench will do for the while. It is showing things.

>
> dbench 128:
> Throughput 161.237 MB/sec 128 procs
> dbench 128 2>& 1 143.85s user 3311.10s system 1219% cpu 4:43.27 total
>
> vma samples %-age symbol name
> c0106ff4 9134179 33.7261 default_idle
> c01dc3b0 5570229 20.5669 __copy_to_user_ll
> c01dc418 1773600 6.54865 __copy_from_user_ll
> c0119058 731524 2.701 try_to_wake_up
> c0108140 686952 2.53643 .text.lock.semaphore
> c011a1bc 489415 1.80706 schedule
> c0119dac 485196 1.79149 scheduler_tick
> c011fadc 448048 1.65433 profile_hook
> c0119860 356065 1.3147 load_balance
> c0107d0c 267333 0.987072 __down
> c011c4ff 249627 0.921696 .text.lock.sched

The wakeup and .text.lock.semaphore load indicates that there is a lot
of contention for a semaphore somewhere. Still.

I'm not sure which one. It shouldn't be a directory semaphore. Might be
lock_super() in the inode allocator, but that seems unlikely.


2003-03-15 06:54:45

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

William Lee Irwin III <[email protected]> wrote:
>> Looks like dbench doesn't scale. It needs to learn how to spread itself
>> across disks if it's not to saturate a device queue while at the same
>> time generating enough cpu load to saturate cpus.

On Fri, Mar 14, 2003 at 10:44:13PM -0800, Andrew Morton wrote:
> Nope. What we're trying to measure here is pure in-memory lock contention,
> locked bus traffic, context switches, etc, etc. To do that we need to get
> the IO system out of the picture.
> One way to do that is to increase /proc/sys/vm/dirty_ratio and
> dirty_background_ratio to 70% or so. You can still hit IO wait if someone
> tries to truncate a file which pdflush is writing out, so increase
> dirty_expire_centisecs and dirty_writeback_centisecs to 1000000000 or so...
> Then, on the second run, when all the required metadata blocks are in
> pagecache you should be able to get an IO-free run.

Oh, sorry, I did increase dirty_ratio and dirty_background_ratio to 99,
I forgot about dirty_writeback_centisecs though, I'll re-run with that.

William Lee Irwin III <[email protected]> wrote:
>> Is there a better (publicable/open/whatever) benchmark?

On Fri, Mar 14, 2003 at 10:44:13PM -0800, Andrew Morton wrote:
> I have lots of little testlets which can be mixed and matched. RAM-only
> dbench will do for the while. It is showing things.
>

William Lee Irwin III <[email protected]> wrote:
>> dbench 128:
>> Throughput 161.237 MB/sec 128 procs
>> dbench 128 2>& 1 143.85s user 3311.10s system 1219% cpu 4:43.27 total
>> vma samples %-age symbol name
>> c0106ff4 9134179 33.7261 default_idle
>> c01dc3b0 5570229 20.5669 __copy_to_user_ll
>> c01dc418 1773600 6.54865 __copy_from_user_ll
>> c0119058 731524 2.701 try_to_wake_up
>> c0108140 686952 2.53643 .text.lock.semaphore
>> c011a1bc 489415 1.80706 schedule
>> c0119dac 485196 1.79149 scheduler_tick
>> c011fadc 448048 1.65433 profile_hook
>> c0119860 356065 1.3147 load_balance
>> c0107d0c 267333 0.987072 __down
>> c011c4ff 249627 0.921696 .text.lock.sched

On Fri, Mar 14, 2003 at 10:44:13PM -0800, Andrew Morton wrote:
> The wakeup and .text.lock.semaphore load indicates that there is a lot
> of contention for a semaphore somewhere. Still.
> I'm not sure which one. It shouldn't be a directory semaphore. Might be
> lock_super() in the inode allocator, but that seems unlikely.

I'm going to have to break out tools to decipher which one this is.
hlinder forward-ported lockmeter so I'll throw that in the mix.


-- wli

2003-03-15 08:14:26

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

On Fri, Mar 14, 2003 at 10:44:13PM -0800, Andrew Morton wrote:
>> Nope. What we're trying to measure here is pure in-memory lock contention,
>> locked bus traffic, context switches, etc, etc. To do that we need to get
>> the IO system out of the picture.
>> One way to do that is to increase /proc/sys/vm/dirty_ratio and
>> dirty_background_ratio to 70% or so. You can still hit IO wait if someone
>> tries to truncate a file which pdflush is writing out, so increase
>> dirty_expire_centisecs and dirty_writeback_centisecs to 1000000000 or so...
>> Then, on the second run, when all the required metadata blocks are in
>> pagecache you should be able to get an IO-free run.
>
On Fri, Mar 14, 2003 at 11:05:11PM -0800, William Lee Irwin III wrote:
> Oh, sorry, I did increase dirty_ratio and dirty_background_ratio to 99,
> I forgot about dirty_writeback_centisecs though, I'll re-run with that.

Next pass involves lockmeter:

$ cat /proc/sys/vm/dirty_expire_centisecs
360000
$ cd /test/wli
$ ls
$ (time dbench 128) |& tee -a ~/dbench.output.log.7
zsh: correct '~/dbench.output.log.7' to '~/dbench.output.log.6' [nyae]? n
128 clients started
0 62477 206.65 MB/sec
Throughput 206.651 MB/sec 128 procs
dbench 128 143.50s user 3258.66s system 1574% cpu 3:36.04 total

vma samples %-age symbol name
c0106ff4 7617343 29.5286 default_idle
c01dc3b0 5212934 20.2079 __copy_to_user_ll
c01dc418 1806434 7.00263 __copy_from_user_ll
c0264bd0 1595815 6.18617 sync_buffer
c0108140 712115 2.76051 .text.lock.semaphore
c0119058 621494 2.40922 try_to_wake_up
c011a1bc 409622 1.5879 schedule
c0107d0c 278704 1.08039 __down
c011c4ff 263802 1.02263 .text.lock.sched
c0152ab0 260394 1.00942 __find_get_block_slow
c011fadc 247423 0.959134 profile_hook
c0264da0 231721 0.898265 add_event_entry
c0168aac 223276 0.865528 d_lookup
c01dc510 212968 0.825569 atomic_dec_and_lock
c0119dac 208443 0.808028 scheduler_tick
c015f3dc 192765 0.747253 path_lookup
c01522a0 191853 0.743717 file_move
c0119860 188927 0.732375 load_balance
c0122930 168016 0.651313 current_kernel_time
c010f6b8 166633 0.645952 timer_interrupt
c013ece4 160376 0.621697 check_highmem_ptes
c0133118 155858 0.604183 find_get_page


procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
0 0 48309664 568288 23360 0 0 1026 18 0 0 100 0
0 0 48274208 568288 23392 0 0 1028 256 0 1 99 0
91 0 48192928 569056 58272 0 0 1036 1790 1 20 79 0
49 0 47769920 580224 439808 0 0 1024 5071 2 94 4 0
70 0 47437248 582720 773952 0 0 1030 2366 1 95 5 0
56 1 47140576 586656 1062272 0 0 1022 1931 1 97 3 0
86 0 46901920 581568 1307168 0 0 1025 1800 0 98 2 0
55 0 46619744 584672 1585568 0 0 1028 2584 1 93 5 0
26 1 46343712 577600 1867360 0 0 1025 3387 1 87 12 1
20 0 46033184 575936 2176384 0 0 1028 5742 2 67 30 1
27 1 45834912 577056 2366304 0 0 1027 6072 2 66 31 1
24 1 45692128 577184 2504960 16 0 1029 8056 2 56 41 1
procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
18 0 45533280 581376 2659648 32 0 1037 8107 2 57 40 1
23 1 45386464 575264 2810592 32 0 1036 8227 2 54 42 2
4 4 45342240 577056 2852768 360 0 1112 3217 1 14 81 5
11 1 45283040 576160 2911712 140 0 1065 9003 1 26 71 2
8 1 45221856 580064 2968128 64 0 1043 11620 2 31 66 1
15 0 45181984 577024 3009120 92 0 1050 9451 2 40 57 1
10 0 45160224 579136 3027584 32 0 1038 9296 2 47 51 0
12 2 45198176 575200 2991840 4 1156 1205 9604 3 47 47 3
19 0 45284320 571872 2909440 0 72 1180 10532 2 42 54 1
21 0 45364064 574112 2827136 0 0 1028 9547 2 49 49 0
23 0 45368224 576384 2821504 0 0 1026 7902 3 57 41 0
22 0 45354720 571776 2839616 0 512 1127 7965 3 60 37 0
16 0 45315168 575072 2874272 0 0 1054 8367 3 59 39 0
19 0 45290592 578272 2898048 68 0 1043 8835 2 45 52 0
18 0 45244768 574432 2947424 8 640 1188 9696 2 45 51 2
18 0 45217184 576064 2973184 8 0 1028 10751 2 43 55 0
17 0 45192992 578080 2995040 24 0 1031 10860 2 40 58 0
21 0 45189408 572736 3004128 0 708 1201 8661 3 56 41 0
17 0 45219744 574112 2971392 8 0 1028 9531 3 46 51 0
26 0 45270176 575232 2919456 0 0 1028 9925 3 44 53 0
28 0 45332448 576672 2855968 0 0 1026 9073 3 50 47 0
procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
17 0 45365536 578496 2821440 4 0 1028 8403 3 53 44 0
14 0 45353056 572768 2840352 0 592 1173 7842 3 60 37 0
14 0 45307808 574400 2882176 0 0 1028 8073 3 60 38 0
20 0 45287776 576320 2902208 0 0 1028 8750 3 54 43 0
17 0 45245280 578240 2943424 4 0 1025 10229 2 44 54 0
19 0 45229408 571840 2965536 0 588 1173 9809 2 44 53 0
15 1 45210848 573664 2981216 12 0 1028 9992 2 46 52 0
18 0 45189024 574880 3002016 48 0 1040 8482 2 50 48 0
9 0 45216032 576032 2972800 0 0 1026 8795 3 53 44 0
12 0 45263968 577056 2923680 0 0 1027 10132 3 43 55 0
18 0 45312992 578080 2874432 0 0 1027 8791 3 51 46 0
20 1 45362336 575168 2828000 0 896 1118 8793 2 52 45 1
29 0 45345376 573760 2847296 0 76 1172 8315 3 56 41 0
30 0 45319840 575648 2871040 0 0 1032 7647 3 61 36 0
6 1 45289440 576864 2898432 12 0 1026 8163 2 53 44 0
16 0 45255520 578240 2932256 12 0 1030 9372 2 44 54 0
16 0 45228000 572800 2965376 0 1016 1227 10052 2 44 52 1
15 0 45216736 573920 2974880 0 0 1080 10110 2 46 51 0
21 0 45200672 574912 2990528 0 0 1025 8989 2 54 44 0
24 0 45214304 575776 2974880 0 0 1027 9274 3 50 47 0
20 0 45263712 576608 2923520 0 0 1026 8689 3 50 48 0
procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
22 0 45320800 577440 2867328 0 0 1027 9487 2 47 50 0
19 0 45344416 571488 2849888 0 632 1079 8074 3 58 39 0
23 0 45344096 573184 2848384 0 0 1132 8043 3 57 40 0
25 0 45324960 574464 2866784 0 0 1026 7457 3 64 33 0
17 0 45296992 575808 2893440 0 0 1027 9124 3 51 46 0
19 0 45265952 576992 2923648 0 0 1025 9579 2 50 48 0
23 0 45237280 577728 2949344 0 0 1030 10051 2 48 50 0
15 0 45208800 578912 2978656 0 0 1026 10072 2 47 51 0
19 0 45211552 568832 2985696 0 176 1070 8537 3 55 42 0
20 0 45215328 569792 2980736 0 0 1027 9434 3 47 50 0
12 0 45245408 570432 2949472 0 0 1026 9018 3 51 46 0
19 0 45317600 570816 2875904 0 0 1025 9767 3 47 50 0
13 0 45351712 571616 2842272 0 0 1030 9012 2 51 46 0
23 0 45342752 572640 2851232 0 0 1027 8134 3 59 38 0
23 0 45332832 573536 2859616 0 0 1026 8081 3 57 40 0
15 0 45304992 574816 2886560 0 0 1024 8929 2 54 44 0
19 0 45270752 575712 2919776 0 0 1027 9503 2 49 49 0
16 0 45230560 576896 2958208 0 0 1031 10190 2 44 54 0
22 0 45205344 577664 2981312 0 0 1024 9903 2 45 53 0
16 0 45206688 578720 2981024 4 0 1028 8860 2 53 44 0
20 0 45218080 572128 2976416 0 988 1168 9526 3 50 47 1
procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
20 0 45242400 572640 2950144 0 0 1133 8446 3 52 45 0
21 0 45312608 573024 2879936 0 0 1026 9444 3 48 49 0
22 0 45347104 573664 2844576 0 0 1026 8102 3 57 40 0
15 0 45344352 574368 2847584 0 0 1027 8310 3 57 40 0
27 0 45334304 575008 2856736 0 0 1026 8763 3 52 45 0
26 0 45303328 575712 2887648 0 0 1026 9057 2 53 44 0
17 0 45269984 576704 2920704 0 0 1026 9975 2 45 52 0
20 0 45222880 578240 2966080 0 0 1027 10239 2 44 53 0
23 0 45209760 573120 2983168 4 852 1138 10067 2 45 52 1
25 0 45197792 573856 2995200 0 0 1136 8914 3 51 46 0
17 0 45216288 574208 2974976 0 0 1017 9714 2 47 51 0
15 0 45255712 574720 2935488 0 0 1026 9560 3 48 49 0
20 0 45311072 575360 2879296 0 0 1027 8887 3 55 42 0
29 0 45339360 575904 2851072 0 0 1031 7836 3 61 36 0
24 0 45341280 576480 2849056 0 0 1022 7930 3 57 40 0
27 0 45330976 577504 2858400 0 0 1025 7902 3 60 37 0
25 0 45311264 578112 2877728 0 0 1027 8213 3 56 42 0
17 0 45271712 579264 2916256 20 0 1032 9453 2 47 51 0
21 0 45236064 573632 2957344 0 864 1240 10165 2 45 52 1
21 0 45214752 574336 2977152 0 0 1026 10164 2 43 54 0
28 0 45212512 574976 2979936 0 0 1026 9132 3 52 45 0
procs -----------memory--------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
22 0 45204512 575360 2985536 0 0 1028 9087 3 51 46 0
16 0 45234976 576096 2954816 0 0 1028 9246 3 52 46 0
28 0 45284640 576608 2903360 0 0 1028 8713 3 55 43 0
30 0 45327328 577120 2860992 0 0 1026 8932 3 53 44 0
23 0 45342048 577504 2847104 0 0 1026 8421 3 55 43 0
26 0 45334656 578048 2854144 0 0 1025 7808 3 56 41 0
26 0 45315616 578720 2872352 0 0 1027 8527 3 53 45 0
18 1 45280800 574528 2911712 0 524 1028 8218 2 55 42 0
22 0 45246752 573792 2946784 0 416 1256 9411 3 45 51 1
11 0 45226464 574432 2965856 0 0 1026 9924 2 44 54 0
13 0 45214432 575296 2977056 0 0 1027 9667 2 47 51 0
24 0 45214048 575872 2976544 0 0 1026 8562 2 53 45 0
26 0 45216544 576288 2973792 0 0 1026 8803 3 48 49 0
28 0 45261472 576640 2928544 0 0 1043 9363 3 50 47 0
27 0 45304352 576928 2884896 0 0 1009 8179 3 52 45 0
27 0 45349408 577472 2839488 0 0 1028 8289 3 55 42 0
17 0 45342432 577792 2845600 0 0 1033 8466 3 56 41 0
35 0 45328672 578112 2858560 0 0 1022 8160 2 56 41 0
23 0 45290720 578464 2897344 0 0 1026 8669 2 51 47 0
17 0 45249824 579360 2938080 0 0 1027 8979 2 51 47 0
14 0 45234080 572896 2960544 0 784 1215 10500 2 43 54 1
procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
20 0 45218656 573344 2974624 0 0 1030 10204 2 44 54 0
35 0 45216480 573664 2976992 0 0 1049 8737 2 51 46 0
15 0 45230048 574048 2962240 0 0 1277 10935 3 52 45 0
19 0 45256928 574432 2934816 0 0 1012 8437 3 50 47 0
18 0 45306080 574720 2885344 0 0 1025 8968 3 51 46 0
11 0 45346016 574944 2846176 0 0 1026 8805 2 53 45 0
21 0 45350688 575200 2842144 0 0 1026 7943 3 57 40 0
13 0 45315552 575520 2876352 0 0 1026 8419 2 56 42 0
24 0 45292704 575872 2897824 0 0 1028 8356 2 55 43 0
23 0 45254176 576576 2936992 0 0 1029 9594 3 49 49 0
18 0 45229856 577056 2960640 0 0 1027 9921 2 46 52 0
17 0 45209632 577600 2980864 0 0 1025 9635 2 45 53 0
26 0 45221664 577888 2967232 0 0 1045 9195 2 51 47 0
16 0 45231264 578464 2957248 0 0 1264 10989 2 51 47 0
21 0 45253088 578784 2935552 0 0 1025 8312 3 55 43 0
22 0 45310240 579040 2876992 0 0 1028 8878 3 51 47 0
23 0 45346016 579392 2841696 0 0 1026 9040 3 49 48 0
19 0 45345696 579712 2842464 0 0 1026 8184 2 58 39 0
23 0 45324000 580064 2863296 0 0 1027 8464 3 53 45 0
21 0 45285024 574240 2909024 0 748 1157 8943 2 52 45 0
17 0 45251360 574848 2941888 0 0 1081 9679 3 48 50 0
procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
16 0 45233760 575296 2959616 0 0 1027 10785 2 43 55 0
21 0 45208032 575904 2983936 20 0 1034 9635 2 46 52 0
11 0 45212512 576576 2977248 12 0 1030 9593 2 48 49 0
13 0 45226848 576832 2963360 0 0 1026 9295 3 47 51 0
27 0 45264544 577088 2924704 0 0 1026 8856 3 51 46 0
17 0 45308768 577280 2880608 0 0 1027 9160 3 51 46 0
28 0 45335584 577536 2854528 0 0 1030 8855 3 52 46 0
25 0 45338336 577856 2850976 0 0 1027 7700 3 57 40 0
29 0 45326816 578048 2862496 0 0 1025 7957 3 56 42 0
33 0 45291168 578400 2897952 0 0 1028 8537 2 55 43 0
17 0 45252576 578880 2936192 0 0 1024 8634 2 53 44 0
17 0 45226912 579136 2960416 0 0 1026 9436 2 49 49 0
15 0 45206240 579392 2980928 0 0 1028 9701 2 51 47 0
26 0 45210976 579520 2978112 0 0 1026 9054 3 52 45 0
28 0 45227040 579712 2960576 0 0 1025 9191 3 47 50 0
25 0 45255328 579968 2932736 0 0 1027 8116 3 55 42 0
20 0 45301152 580096 2886624 0 0 1027 9113 3 52 45 0
23 0 45324128 572960 2870272 0 612 1129 7719 3 57 40 0
22 0 45341792 573216 2852512 0 0 1075 8123 3 54 43 0
23 0 45341728 573760 2851616 0 0 1027 8674 2 55 43 0
18 0 45306720 573984 2887360 0 0 1026 8670 2 55 43 0
procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
16 0 45282016 574176 2911232 0 0 1028 9431 3 50 48 0
17 0 45243040 574432 2951456 0 0 1030 9618 3 49 48 0
21 0 45218784 574656 2974240 0 0 1025 9529 2 49 49 0
21 0 45219296 574848 2971904 0 0 1027 9826 2 48 49 0
21 0 45234464 575008 2956288 0 0 1026 9718 3 48 50 0
17 0 45265376 575168 2927712 0 0 1028 8749 3 57 40 0
22 0 45278752 575232 2914176 0 0 1026 8613 3 55 42 0
31 0 45304480 575520 2887328 0 0 1025 7851 3 60 37 0
20 0 45329888 575808 2861280 0 0 1026 8646 3 54 43 0
24 0 45331872 575936 2859552 0 0 1027 8722 2 54 44 0
21 0 45305056 576192 2886976 0 0 1026 8383 2 58 40 0
24 0 45279136 576416 2912736 0 0 1027 8586 2 57 41 0
21 0 45248736 576896 2942272 0 0 1026 9470 2 51 47 0
14 0 45223392 577184 2967328 0 0 1026 10033 2 47 50 0
19 0 45219104 577376 2971392 0 0 1026 9856 2 49 49 0
17 0 45222432 577600 2967456 0 0 1027 9484 3 47 51 0
29 0 45261472 577632 2927264 0 0 1035 8997 3 54 43 0
25 0 45271008 577792 2917664 0 0 1021 8259 3 55 42 0
24 0 45291936 578016 2897632 0 0 1027 7985 3 57 40 0
22 0 45308960 578336 2880864 0 0 1027 9218 3 52 46 0
18 0 45328160 578560 2861120 0 0 1025 8907 2 51 47 0
procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
22 0 45304480 578784 2884768 0 0 1026 8257 2 52 45 0
36 0 45279776 578912 2909472 0 0 1027 8651 2 54 43 0
28 0 45254624 579104 2933280 0 0 1026 9417 3 49 48 0
22 0 45238688 579328 2949920 0 0 1031 8350 2 57 41 0
20 0 45216800 579552 2972384 0 0 1023 8923 2 51 47 0
22 0 45217504 579616 2970464 0 0 1026 9383 3 50 47 0
19 0 45249760 579808 2937920 0 0 1026 9535 2 47 50 0
21 1 45283680 580480 2902336 24 0 1032 9435 3 49 49 0
26 0 45316960 575424 2873696 52 800 1237 8920 3 57 37 4
18 2 45365152 579776 2822816 220 0 1079 8644 3 56 40 1
21 10 45406880 576064 2783904 216 1104 1219 9441 3 59 30 8
19 13 45499040 576800 2691584 184 252 1229 14054 2 52 31 15
22 26 45560736 578432 2627872 544 584 1204 11215 2 63 19 16
36 49 45656672 585472 2524128 616 288 1236 21734 2 61 14 23
34 47 45793248 580544 2391296 564 384 1251 26024 2 67 10 21
20 5 45992672 582432 2189312 272 896 1338 19731 2 75 9 15
16 107 46253536 578656 1932672 832 524 1157 41761 1 54 9 36
1 106 46416608 575776 1792800 472 1200 1212 37282 0 23 11 66
7 100 46468000 578784 1739456 392 1776 1201 20277 0 7 4 89
1 100 46538208 571104 1679872 524 16 1234 22199 0 8 3 88
0 99 46624480 578016 1587840 892 0 1255 32708 0 11 3 85
procs -----------memory---------- -----io---- --system-- ----cpu----
r b free buff cache bi bo in cs us sy id wa
16 95 46728928 573920 1487168 916 44 1259 35577 0 12 2 86
1 98 46829792 570080 1392160 860 52 1261 32797 0 12 4 85
10 94 46924640 576640 1292160 860 0 1251 33826 0 12 1 87
8 91 47017376 572576 1205824 808 24 1245 29650 0 10 2 87
2 90 47122592 579392 1096864 904 0 1256 35883 0 13 2 86
11 85 47241056 578784 978976 844 24 1247 36740 0 13 4 83
8 81 47411872 578720 815584 892 4 1260 46037 0 18 5 77
5 66 47588576 576832 651200 740 4 1229 42522 0 17 9 74
8 48 47851872 573696 413280 760 4 1237 43504 0 19 15 66
5 24 48114848 579616 164448 760 0 1244 31623 0 15 25 60
0 0 48296032 575456 21280 396 0 1155 7253 0 6 70 24
0 0 48296160 575456 21408 4 0 1028 88 0 1 99 0
0 0 48295456 575488 21664 44 0 1037 174 0 1 98 0
0 0 48295456 575488 21664 0 0 1025 20 0 0 100 0
0 0 48295456 575488 21664 0 0 1025 18 0 0 100 0
0 0 48295456 575488 21664 0 0 1025 22 0 0 100 0

2003-03-15 08:13:31

by Alex Tomas

[permalink] [raw]
Subject: Re: [Ext2-devel] Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

>>>>> Andrew Morton (AM) writes:

AM> Nope. What we're trying to measure here is pure in-memory lock
AM> contention, locked bus traffic, context switches, etc, etc. To
AM> do that we need to get the IO system out of the picture.

I simply use my own pretty simple test. BTW, you may disable preallocation
to increase the allocation rate.


bash-script:
============================================================
#!/bin/sh
# Spawn $1 concurrent cdsingle writers, one per /mnt/<n> directory,
# each writing $2 blocks and repeating (write+truncate) $3 times.
#
# args:
#   1 - how many processes to create
#   2 - how many blocks in file to be written
#   3 - how many times to repeat (write+truncate)
# for example: cd.sh 2 32 100000
#
# NOTE: the shebang is /bin/sh, so only POSIX sh constructs are used
# (the original used the bash-only `let' builtin).

i=0
while [ "$i" -lt "$1" ]; do
    if [ ! -d "/mnt/$i" ]; then
        mkdir "/mnt/$i"
    fi
    rm -rf "/mnt/$i"/*
    i=$((i + 1))
done

# Flush pending writes so the benchmark starts from a quiet disk.
sync
sync

i=0
while [ "$i" -lt "$1" ]; do
    time /root/cdsingle "$2" "$3" "/mnt/$i/1" &
    i=$((i + 1))
done

# Wait for all writers to finish before returning.
wait
============================================================

C program, which loops over writes and truncates:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

/* Upper bound on argc: fd[] is indexed directly by the argv position. */
#define MAX_FILES 1024

/*
 * Benchmark driver: for each file named on the command line, repeatedly
 * write 'siz' 4KB blocks, then truncate back to zero, 'num' times over.
 * This hammers the block allocator (allocate + free cycles).
 *
 * usage: cdsingle <blocks-per-pass> <passes> <file>...
 */
int main(int argc, char **argv)
{
	int i, j, num, siz, k;
	ssize_t err;		/* write(2) returns ssize_t, not int */
	int fd[MAX_FILES];
	static char buf[4096];	/* zero-filled payload; content is irrelevant
				 * (original wrote from the 'main' function
				 * pointer, which reads past the function) */

	if (argc < 4 || argc > MAX_FILES) {
		fprintf(stderr,
			"usage: %s <blocks-per-pass> <passes> <file>...\n",
			argv[0]);
		exit(1);
	}

	num = atoi(argv[2]);
	siz = atoi(argv[1]);

	/* Create all target files up front; abort on the first failure. */
	for (i = 3; i < argc; i++) {
		fd[i] = creat(argv[i], 0666);
		if (fd[i] < 0) {
			perror("can't create");
			exit(1);
		}
	}

	/* The measured loop: write siz blocks to each file, then truncate. */
	for (j = 0; j < num; j++) {
		for (i = 3; i < argc; i++) {
			for (k = 0; k < siz; k++)
				if ((err = write(fd[i], buf, sizeof(buf))) < 0) {
					printf("err=%zd\n", err);
					perror("can't write");
					exit(1);
				}
		}
		for (i = 3; i < argc; i++) {
			ftruncate(fd[i], 0);
			lseek(fd[i], 0, SEEK_SET);
		}
	}

	/* Clean up: close and remove every file we created. */
	for (i = 3; i < argc; i++) {
		close(fd[i]);
		if (unlink(argv[i]) < 0)
			perror("can't unlink");
	}
	return 0;
}


2003-03-15 08:18:59

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [Ext2-devel] Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

Andrew Morton (AM) writes:
AM> Nope. What we're trying to measure here is pure in-memory lock
AM> contention, locked bus traffic, context switches, etc, etc. To
AM> do that we need to get the IO system out of the picture.

On Sat, Mar 15, 2003 at 11:16:10AM +0300, Alex Tomas wrote:
> I simple use own pretty simple test. btw, you may disable preallocation
> to increase allocation rate

This looks very interesting, but it may have to wait ca. 24 hours for
some benchmark time b/c of the long boot times and late hour in .us.

This also looks like it would be a much better stress test, and the
NUMA-Q is known for bringing out many rare races. There are good
reasons to run this test even aside from performance.


-- wli

2003-03-15 08:29:49

by Alex Tomas

[permalink] [raw]
Subject: Re: [Ext2-devel] Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

>>>>> William Lee Irwin (WLI) writes:

>> I simple use own pretty simple test. btw, you may disable
>> preallocation to increase allocation rate

WLI> This looks very interesting, but it may have to wait ca. 24
WLI> hours for some benchmark time b/c of the long boot times and
WLI> late hour in .us.

WLI> This also looks like it would be a much better stress test, and
WLI> the NUMA-Q is known for bringing out many rare races. There is
WLI> are good reasons to run this test even aside from performance.

fine. it's really interesting to see results for so big iron.

2003-03-15 09:12:49

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [Ext2-devel] Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

>>>>> William Lee Irwin (WLI) writes:
>> I simple use own pretty simple test. btw, you may disable
>> preallocation to increase allocation rate

WLI> This looks very interesting, but it may have to wait ca. 24
WLI> hours for some benchmark time b/c of the long boot times and
WLI> late hour in .us.
WLI> This also looks like it would be a much better stress test, and
WLI> the NUMA-Q is known for bringing out many rare races. There is
WLI> are good reasons to run this test even aside from performance.

On Sat, Mar 15, 2003 at 11:32:28AM +0300, Alex Tomas wrote:
> fine. it's really interesting to see results for so big iron.
So maybe it's pointless to elaborate on this in particular, but...

I actually borrowed time on the extra quads (I have 4 that are primarily
used by me; these systems support static partitioning, so as long as the
cabling is done right, you can make 4 4 quad systems from 16 quads, or
2 8 quad systems from 16 quads, or 1 16 quad system from 16 quads, etc.;
the other 4 are actually primarily used by Dave Hansen, but he's been
tied up with tasks that need him to use other systems this week and so
lent them to me) for the purpose of hardening pgcl (my forward port of
Hugh Dickins' page clustering patch), but when the issue of lock
contention came up, I thought it would be a good idea to utilize the
elevated cpu count to highlight the lock contention you were trying to
address with this patch. I'd be more than happy to see an effective
case for it made or otherwise demonstrate its merits.

I guess it's mostly OT and/or organizational, but it might (for those
who are interested) give an idea of how the time on these larger
systems is spent. In this case, the larger system is dynamically put
together from two smaller systems when another kernel hacker isn't
focusing on that system and nicely cooperates to give other people time
to test/benchmark/etc. on the hardware that can be glued together with
stuff regularly used by some other kernel hacker to form a larger system.
To some it might sound inconvenient, but I'm grateful for every minute
of time I get on the things.

There are other situations or "typical patterns" for getting at the
larger systems. What's probably the most typical pattern of all is that
the vendors themselves can't afford the larger models of their own
machines for kernel hacking purposes, and so the hackers (and their
managers and other kinds of helpers) scramble to beg, borrow, and steal
time on such machines from whatever places they can.

I have no idea what possessed me to describe all this, but I'll go on.
And sorry that this is probably very irrelevant to you Alex, but:

To all those who help get me in front of these things, i.e. Dave, Hans,
Martin, Gerrit, Hubertus, et al, thanks a million! I love hacking on
big boxen, and (at least from the above) it's clear I can't do it alone.


-- wli

2003-03-15 09:37:39

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

On Sat, Mar 15, 2003 at 12:24:31AM -0800, William Lee Irwin III wrote:
> Next pass involves lockmeter:

Throughput 39.2014 MB/sec 128 procs
dbench 128 142.51s user 10828.91s system 964% cpu 18:57.88 total

That's an 83% reduction in throughput from applying lockmeter.

Um, somebody should look into this. The thing is a bloody doorstop:

vma samples %-age symbol name
c012fbbd 20829312 51.3129 .text.lock.lockmeter
c012eb1c 3834281 9.44573 alloc_rwlock_struct
c0106f74 2940592 7.24413 default_idle
c025174c 2837008 6.98895 sync_buffer
c012ec58 1438542 3.54384 _metered_read_lock
c012e98c 1129385 2.78223 _metered_spin_lock
c012ea94 1044869 2.57403 _metered_spin_unlock
c012e89c 982225 2.41971 lstat_update
c012efd0 702482 1.73056 _metered_write_lock
c01cca10 657780 1.62044 __copy_to_user_ll
c012e6f0 587940 1.44839 lstat_lookup
c0251910 551723 1.35917 add_event_entry
c012ee70 512327 1.26211 _metered_read_unlock
c0109a30 298994 0.73657 apic_timer_interrupt
c01cca78 202482 0.498813 __copy_from_user_ll
c0120730 159350 0.392558 current_kernel_time
c012f148 133340 0.328482 _metered_write_unlock
c02516a8 127259 0.313502 add_sample
c0251634 112579 0.277338 add_sample_entry
c0118cac 102231 0.251845 scheduler_tick
c010f080 75857 0.186873 timer_interrupt

AFAICT the actual results are also garbage.


System: Linux curly 2.5.64 #1 SMP Sat Mar 15 00:49:53 PST 2003 i686
Total counts

All (32) CPUs

Start time: Sat Mar 15 01:20:18 2003
End time: Sat Mar 15 01:39:03 2003
Delta Time: 1129.85 sec.
Hash table slots in use: 216.
Global read lock slots in use: 999.

*************************** Warnings! ******************************
Read Lock table overflowed.

The data in this report may be in error due to this.
************************ End of Warnings! **************************


- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
SPINLOCKS HOLD WAIT
UTIL CON MEAN( MAX ) MEAN( MAX )(% CPU) TOTAL NOWAIT SPIN RJECT NAME

2.6% 0us 821511883 97.4% 0.15% 2.5% *TOTAL*

0% 0% 0us 179 100% 0% 0% [0xc0433810]
0% 0% 0us 179 100% 0% 0% uart_wait_modem_status+0x168

0% 0% 0us 179 100% 0% 0% [0xc0434260]
0% 0% 0us 179 100% 0% 0% uart_wait_modem_status+0x18c

0% 0% 0us 129 100% 0% 0% [0xee1c90f0]
0% 0% 0us 129 100% 0% 0% autofs4_dir_rmdir+0xa0

0% 0% 0us 1 100% 0% 0% [0xee66b020]
0% 0% 0us 1 100% 0% 0% inet_rtm_newaddr+0x54

0% 0% 0us 49 100% 0% 0% [0xef30b8d4]
0% 0% 0us 16 100% 0% 0% do_mmap_pgoff+0x468
0% 0% 0us 33 100% 0% 0% sys_mlockall+0x88

0% 3.7% 0us 46947 96.3% 3.7% 0% [0xef3c85f4]
0% 3.7% 0us 46947 96.3% 3.7% 0% sys_fchmod+0xb8

0% 0% 0us 227 100% 0% 0% [0xef53e994]
0% 0% 0us 227 100% 0% 0% scsi_dispatch_cmd+0x38

0% 0% 0us 2770 100% 0% 0% [0xef55011c]
0% 0% 0us 2218 100% 0% 0% __constant_c_and_count_memset+0x24
0% 0% 0us 552 100% 0% 0% _decode_session+0x130

0% 0% 0us 2218 100% 0% 0% [0xef550128]
0% 0% 0us 2218 100% 0% 0% _decode_session+0x14

0% 0% 0us 2 100% 0% 0% [0xef6455a0]
0% 0% 0us 1 100% 0% 0% filemap_sync+0xcc
0% 0% 0us 1 100% 0% 0% mprotect_fixup+0x14

0% 0% 0us 2 100% 0% 0% [0xf04c7df0]
0% 0% 0us 1 100% 0% 0% get_one_pte_map_nested+0x58
0% 0% 0us 1 100% 0% 0% insert_vm_struct+0xc

0% 3.0% 0us 1543963 97.0% 3.0% 0% __per_cpu_end+0xdc
0% 3.6% 0us 220 96.4% 3.6% 0% clear_inode+0x58
0% 1.9% 0us 258185 98.1% 1.9% 0% dentry_iput+0x24
0% 4.3% 0us 119875 95.7% 4.3% 0% do_pollfd+0x68
0% 2.6% 0us 258168 97.4% 2.6% 0% do_poll+0x7c
0% 3.2% 0us 385561 96.8% 3.2% 0% sys_poll+0x218
0% 3.4% 0us 258115 96.6% 3.4% 0% sys_select+0x7c
0% 0% 0us 5 100% 0% 0% sys_select+0x204
0% 3.2% 0us 263834 96.8% 3.2% 0% sys_select+0x340

0% 0% 0us 3 100% 0% 0% _binary_usr_initramfs_data_cpio_gz_end+0x13c
0% 0% 0us 2 100% 0% 0% do_rw_proc+0x24
0% 0% 0us 1 100% 0% 0% proc_readsys+0xc

0% 31.8% 0us 904982 68.2% 31.8% 0% log_buf+0x5c60
0% 0% 0us 3 100% 0% 0% .text.lock.eventpoll+0x6
0% 20.0% 0us 80 80.0% 20.0% 0% .text.lock.pageattr+0x3f
0% 35.8% 0us 2302 64.2% 35.8% 0% __register_serial+0x98
0% 33.3% 0us 3 66.7% 33.3% 0% aio_kick_handler+0x8
0% 100% 0us 2 0% 100% 0% block_truncate_page+0x20c
0% 31.9% 0us 138 68.1% 31.9% 0% copy_msqid_from_user+0xbc
0% 36.4% 0us 2176 63.6% 36.4% 0% ep_poll+0x34
0% 24.4% 0us 639 75.6% 24.4% 0% flock_make_lock+0x38
0% 36.5% 0us 639 63.5% 36.5% 0% flock_make_lock+0xb0
0% 34.6% 0us 2331 65.4% 34.6% 0% get_pci_port+0x14c
0% 44.0% 0us 639 56.0% 44.0% 0% locks_alloc_lock
0% 35.6% 0us 225 64.4% 35.6% 0% parse_extended+0x184
0% 12.7% 0us 71 87.3% 12.7% 0% pipe_write+0x258
0% 31.7% 0us 895062 68.3% 31.7% 0% proc_pid_make_inode+0x3c
0% 33.3% 0us 3 66.7% 33.3% 0% read_events+0xe8
0% 0% 0us 2 100% 0% 0% serial8250_type+0x8
0% 39.4% 0us 635 60.6% 39.4% 0% unuse_pmd+0x11c
0% 12.5% 0us 32 87.5% 12.5% 0% vfs_create+0x90

0% 0% 0us 2 100% 0% 0% log_buf+0x5c80
0% 0% 0us 2 100% 0% 0% kmap_atomic+0x8

0% 1.6% 0us 1045171 98.4% 1.6% 0% lru_add_active_pvecs__per_cpu+0x20
0% 1.6% 0us 1045150 98.4% 1.6% 0% bd_set_size+0x24
0% 0% 0us 21 100% 0% 0% bd_set_size+0x70

0% 0% 0us 2 100% 0% 0% pci_boards+0x22c
0% 0% 0us 2 100% 0% 0% mtrr_ioctl+0x4f8

0% 0% 0us 6 100% 0% 0% pci_boards+0x234
0% 0% 0us 6 100% 0% 0% __constant_c_and_count_memset+0x50

0% 0% 0us 1129503 100% 0% 0% pci_vendor_list+0x4200
0% 0% 0us 1129503 100% 0% 0% destroy_context+0x4c

0% 0% 0us 1129503 100% 0% 0% pci_vendor_list+0x47d0
0% 0% 0us 1129503 100% 0% 0% mtrr_write+0x194

0% 0% 0us 149 100% 0% 0% pid_hash+0x8680
0% 0% 0us 4 100% 0% 0% bio_add_page+0xa8
0% 0% 0us 140 100% 0% 0% load_balance+0x1e8
0% 0% 0us 5 100% 0% 0% scheduler_tick+0x180

0% 0% 0us 1129503 100% 0% 0% pid_hash+0x8824
0% 0% 0us 1129503 100% 0% 0% init_new_context+0xe0

0% 3.9% 0us 2900686 96.1% 3.9% 0% pid_hash+0x89c0
0% 4.0% 0us 1450343 96.0% 4.0% 0% __pagevec_lru_add+0xa4
0% 3.8% 0us 1450343 96.2% 3.8% 0% do_invalidatepage

0% 6.5% 0us 2985077 93.5% 6.5% 0% pid_hash+0x89e0
0% 6.8% 0us 1492680 93.2% 6.8% 0% sys_swapon+0xe4
0% 50.0% 0us 2 50.0% 50.0% 0% sys_swapon+0x1bc
0% 6.1% 0us 1492393 93.9% 6.1% 0% sys_swapon+0x204
0% 0% 0us 2 100% 0% 0% sys_swapon+0x244

0% 0% 0us 1089 100% 0% 0% pid_hash+0x8a00
0% 0% 0us 1089 100% 0% 0% __block_prepare_write+0x428

0% 7.6% 0us 4513471 92.4% 7.6% 0% pid_hash+0x8a20
0% 25.0% 0us 4929 75.0% 25.0% 0% .text.lock.namei+0x74
0% 8.8% 0us 324046 91.2% 8.8% 0% .text.lock.namei+0x1ac
0% 7.2% 0us 2491551 92.8% 7.2% 0% __follow_down+0x4c
0% 50.0% 0us 2 50.0% 50.0% 0% blkdev_put+0x110
0% 2.4% 0us 1185 97.6% 2.4% 0% count+0x28
0% 11.4% 0us 17982 88.6% 11.4% 0% do_fcntl+0x154
0% 0% 0us 8 100% 0% 0% do_open+0x258
0% 10.0% 0us 20 90.0% 10.0% 0% file_ioctl+0x138
0% 8.6% 0us 6114 91.4% 8.6% 0% locate_fd+0xd0
0% 7.6% 0us 265244 92.4% 7.6% 0% send_sigio+0x94
0% 3.5% 0us 368709 96.5% 3.5% 0% send_sigurg+0x24
0% 7.4% 0us 54539 92.6% 7.4% 0% send_sigurg+0x84
0% 10.6% 0us 368709 89.4% 10.6% 0% setfl+0xa8
0% 9.0% 0us 608895 91.0% 9.0% 0% setfl+0x110
0% 0% 0us 2 100% 0% 0% sys_ioctl+0xf0
0% 7.2% 0us 1536 92.8% 7.2% 0% sys_uselib+0x12c

0% 0% 0us 390 100% 0% 0% pid_hash+0x8a60
0% 0% 0us 195 100% 0% 0% csi_m+0x258
0% 0% 0us 195 100% 0% 0% cursor_report+0x2c

0% 21.7% 0us 25815 78.3% 21.7% 0% tvec_bases__per_cpu+0x24
0% 7.1% 0us 4725 92.9% 7.1% 0% in_group_p+0xc
0% 24.9% 0us 21090 75.1% 24.9% 0% sys_getgroups+0x38

0% 0% 0us 1 100% 0% 0% tvec_bases__per_cpu+0x554
0% 0% 0us 1 100% 0% 0% .text.lock.mprotect+0x41

0% 0% 0us 1089 100% 0% 0% tvec_bases__per_cpu+0xac0
0% 0% 0us 1089 100% 0% 0% get_vm_area+0x4c

0% 0% 0us 1770 100% 0% 0% tvec_bases__per_cpu+0xc70
0% 0% 0us 225 100% 0% 0% __getblk_slow+0x40
0% 0% 0us 445 100% 0% 0% __getblk_slow+0x80
0% 0% 0us 220 100% 0% 0% clear_inode+0x68
0% 0% 0us 880 100% 0% 0% clear_inode+0xbc

0% 0% 0us 69 100% 0% 0% .text.lock.mempool+0x4a
0% 0.00% 0us 670256 100% 0.00% 0% .text.lock.namei+0xac
0% 0% 0us 587 100% 0% 0% __bounce_end_io_read+0x38
0% 0% 0us 140 100% 0% 0% __constant_memcpy+0x10
0% 0.99% 0us 829218 99.0% 0.99% 0% __constant_memcpy+0xf0
0% 0.00% 0us 2491547 100% 0.00% 0% __follow_down+0x60
0% 0.94% 0us 21329 99.1% 0.94% 0% __free_pages_bulk+0x2c
0% 100% 0us 2119165 0% 0% 100% __free_pages_bulk+0x88
0% 57.2% 0us 30598 42.8% 57.2% 0% __ioremap+0x20
0% 0.05% 0us 30598 100% 0.05% 0% __ioremap+0x2c
0% 78.7% 0us 7429 21.3% 78.7% 0% __ioremap+0x68
0% 0.01% 0us 28980 100% 0.01% 0% __set_page_dirty_buffers+0x110
0% 3.5% 0us 20255 96.5% 3.5% 0% balance_dirty_pages+0xb8
0% 0% 0us 4 100% 0% 0% bio_add_page+0x104
0% 0% 0us 4 100% 0% 0% bio_alloc+0xf8
0% 0% 0us 23110 100% 0% 0% cpu_raise_softirq+0x8
0% 0% 0us 142 100% 0% 0% create_workqueue+0x144
0% 0% 0us 327491 100% 0% 0% dentry_open+0x14c
0% 0% 0us 7 100% 0% 0% do_anonymous_page+0x268
0% 0% 0us 7 100% 0% 0% do_page_fault+0xfc
0% 0.71% 0us 203785 99.3% 0.71% 0% do_proc_readlink+0x38
0% 0% 0us 901 100% 0% 0% do_wp_page+0x2cc
0% 0% 0us 22 100% 0% 0% do_wp_page+0x45c
0% 0% 0us 138 100% 0% 0% dup_mmap+0x120
0% 0% 0us 138 100% 0% 0% dup_mmap+0x158
0% 0% 0us 138 100% 0% 0% dup_mmap+0x220
0% 0% 0us 138 100% 0% 0% dup_mmap+0xc8
0% 0.34% 0us 1782 99.7% 0.34% 0% frag_show+0x40
0% 0% 0us 1019899 100% 0% 0% free_buffer_head+0x34
0% 0% 0us 14 100% 0% 0% free_one_pmd+0x168
0% 0.01% 0us 11129 100% 0.01% 0% get_dirty_limits+0x3c
0% 0% 0us 392486 100% 0% 0% get_empty_filp+0x12c
0% 0% 0us 838 100% 0% 0% handle_mm_fault+0xe0
0% 0% 0us 14 100% 0% 0% hugetlb_report_meminfo+0x34
0% 0% 0us 1185123 100% 0% 0% init_buffer_head+0x4c
0% 0% 0us 7 100% 0% 0% kmap_atomic+0x14
0% 0% 0us 75 100% 0% 0% ksoftirqd+0x100
0% 0% 0us 77 100% 0% 0% ksoftirqd+0x10c
0% 0% 0us 152 100% 0% 0% ksoftirqd+0x114
0% 0% 0us 5 100% 0% 0% kunmap+0x20
0% 0.22% 0us 487201 99.8% 0.22% 0% mem_open+0x8
0% 0.65% 0us 510091 99.4% 0.65% 0% mounts_release+0x8
0% 0% 0us 2 100% 0% 0% number+0x1fc
0% 0% 0us 138 100% 0% 0% proc_doutsstring+0x78
0% 0.39% 0us 1275 99.6% 0.39% 0% proc_info_read+0x98
0% 0.38% 0us 675021 99.6% 0.38% 0% proc_pid_cmdline+0x54
0% 0% 0us 714 100% 0% 0% pte_alloc_kernel+0x74
0% 12.2% 0us 312871 87.8% 0% 12.2% remap_area_pages+0x1f0
0% 0% 0us 8 100% 0% 0% risc_code01+0x1490
0% 0% 0us 2186 100% 0% 0% risc_code01+0x1b24
0% 0.13% 0us 2225 99.9% 0.13% 0% risc_code01+0x4f9c
0% 0% 0us 129 100% 0% 0% scheduler_tick+0x2e0
0% 0.03% 0us 1282523 100% 0.03% 0% search_exception_table+0x3c
0% 0% 0us 265244 100% 0% 0% send_sigio+0xa0
0% 0.94% 0us 14301378 99.1% 0.94% 0% send_sigio_to_task+0x30
0% 0% 0us 54539 100% 0% 0% send_sigurg+0x94
0% 0% 0us 4 100% 0% 0% sget+0x30
0% 0% 0us 129 100% 0% 0% shrink_cache+0x264
0% 0% 0us 30 100% 0% 0% shrink_list+0x420
0% 0.00% 0us 1132765 100% 0.00% 0% simd_math_error+0x28
0% 0% 0us 1132765 100% 0% 0% simd_math_error+0x94
0% 0.10% 0us 10704360 99.9% 0.10% 0% split_large_page+0x4
0% 2.5% 0us 11583 97.5% 2.5% 0% sys_access+0x150
0% 3.0% 0us 41401 97.0% 3.0% 0% sys_fstatfs+0x18
0% 0% 0us 14 100% 0% 0% sys_mincore+0x138
0% 0% 0us 4 100% 0% 0% sys_semtimedop+0x3fc
0% 0% 0us 174724 100% 0% 0% udp_queue_rcv_skb+0x5c
0% 2.4% 0us 761925513 97.6% 0% 2.4% udp_recvmsg+0x26c
0% 3.9% 0us 1705264 96.1% 3.9% 0% valid_swaphandles+0x4c
0% 1.2% 0us 1782 98.8% 1.2% 0% vmstat_next+0x38
0% 50.0% 0us 2 50.0% 50.0% 0% vsnprintf+0x20
0% 0% 0us 2518 100% 0% 0% zap_pte_range+0x2e0
0% 0% 0us 805 100% 0% 0% zap_pte_range+0x9c

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RWLOCK READS HOLD MAX RDR BUSY PERIOD WAIT
UTIL CON MEAN RDRS MEAN( MAX ) MEAN( MAX )( %CPU) TOTAL NOWAIT SPIN NAME

0.02% 0us 26339364 100% 0.02% *TOTAL*

0.01% 0% 6.7us 1 6.7us( 628us) 0us 2196 100% 0% [0xc1be51c4]
0% 0us 2196 100% 0% do_pipe+0x1bc

0.00% 0% 11.5us 1 11us( 130us) 0us 3 100% 0% [0xee6ae8a4]
0% 0us 3 100% 0% __constant_memcpy+0x68

0.00% 0% 13.0us 1 13us( 286us) 0us 1099 100% 0% [0xee8c21c0]
0% 0us 1099 100% 0% risc_code01+0x4e1c

55.9% 0% 10.4us 8 271us( 231ms) 0us 1 100% 0% [0xef3c858c]
0% 0us 1 100% 0% sys_setgroups16+0xcc

0.04% 0% 19.1us 1 19us( 481us) 0us 1 100% 0% [0xf057b3a4]
0% 0us 1 100% 0% sys_ioctl+0x38

0.03% 0% 15.8us 1 16us( 337us) 0us 1 100% 0% [0xf057d364]
0% 0us 1 100% 0% set_user_nice

0.01% 0% 6.0us 2 6.1us( 337us) 0us 11056 100% 0% __per_cpu_end+0x1c
0% 0us 11056 100% 0% pipe_write+0x6c

0.07% 0% 202.6us 1 203us(1013us) 0us 3652 100% 0% pid_hash+0x8660
0% 0us 18 100% 0% copy_files+0xd4
0% 0us 49 100% 0% internal_add_timer+0x98
0% 0us 1130 100% 0% sys_capset+0x64
0% 0us 2455 100% 0% sys_personality+0x30

93.4% 0.08% 7.9us 5 354us( 232ms) 0us 3151445 100% 0.08% pid_hash+0x8a40
0% 0us 639 100% 0% lease_alloc+0x30
0% 0us 7 100% 0% setup_swap_extents+0xa8
0.08% 0us 2264074 100% 0.08% try_to_unuse+0xb8
0.08% 0us 886725 100% 0.08% try_to_unuse+0x2a8

1620% 0% 7.5us 15 2027us( 232ms) 0us 13185756 100% 0% serial_pci_tbl+0xb8c
0% 0us 13185756 100% 0% mm_init+0xb8

0% 0us 20917 100% 0% buffered_rmqueue+0x38
0% 0us 16081 100% 0% find_local_symbol+0x8
0% 0us 2513995 100% 0% get_chrfops+0x15c
0% 0us 45266 100% 0% proc_pid_stat+0x120
0.05% 0us 2864093 100% 0.05% sys_getgroups16+0x5c
0.01% 0us 76659 100% 0.01% sys_setgroups16+0x64
0% 0us 4447143 100% 0% sys_swapon+0x16c

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RWLOCK WRITES HOLD WAIT (ALL) WAIT (WW)
UTIL CON MEAN( MAX ) MEAN( MAX )( %CPU) MEAN( MAX ) TOTAL NOWAIT SPIN( WW ) NAME

0% 0us 0us 16216231 100% 0%( 0%) *TOTAL*

0% 0% 0us 0us 1 100% 0%( 0%) [0xe0fff12c]
0% 0% 0us 0us 1 100% 0%( 0%) task_name+0xb0

0% 0% 0us 0us 3 100% 0%( 0%) [0xee6ae8a4]
0% 0% 0us 0us 1 100% 0%( 0%) generic_shutdown_super+0x84
0% 0% 0us 0us 1 100% 0%( 0%) locks_wake_up_blocks+0x58
0% 0% 0us 0us 1 100% 0%( 0%) schedule+0x278

0% 0% 0us 0us 4 100% 0%( 0%) [0xee78d7c4]
0% 0% 0us 0us 4 100% 0%( 0%) free_dma+0x1c

0% 0% 0us 0us 428518 100% 0%( 0%) __per_cpu_end+0x1c8
0% 0% 0us 0us 428518 100% 0%( 0%) posix_test_lock+0x28

0% 0% 0us 0us 35 100% 0%( 0%) cpu_devices+0xb48
0% 0% 0us 0us 35 100% 0%( 0%) fn_hash_delete+0x1a8

0% 0% 0us 0us 12 100% 0%( 0%) memblk_devices+0x904
0% 0% 0us 0us 6 100% 0%( 0%) __func__.1+0x130d8
0% 0% 0us 0us 6 100% 0%( 0%) __func__.1+0x135a4

0% 0% 0us 0us 51985 100% 0%( 0%) pid_hash+0x8a40
0% 0% 0us 0us 51985 100% 0%( 0%) __kill_fasync+0x1c

0% 0% 0us 0us 6 100% 0%( 0%) __func__.1+0x13104
0% 0% 0us 0us 6 100% 0%( 0%) __func__.1+0x13d24
0% 0% 0us 0us 21 100% 0%( 0%) badness+0x30
0% 0% 0us 0us 338541 100% 0%( 0%) badness+0xdc
0% 0% 0us 0us 6 100% 0%( 0%) de_thread+0x25c
0% 0% 0us 0us 447544 100% 0%( 0%) de_thread+0x30
0% 0% 0us 0us 105 100% 0%( 0%) fn_hash_delete+0x220
0% 0% 0us 0us 4 100% 0%( 0%) generic_shutdown_super+0x28
0% 0% 0us 0us 1825042 100% 0%( 0%) get_swap_page+0x8c
0% 0% 0us 0us 20 100% 0%( 0%) ifind+0x5c
0% 0% 0us 0us 25 100% 0%( 0%) inode_change_ok+0x98
0% 0% 0us 0us 2 100% 0%( 0%) lock_get_status+0x54
0% 0% 0us 0us 392487 100% 0%( 0%) param_get_intarray+0x5c
0% 0% 0us 0us 20 100% 0%( 0%) param_set_copystring+0x4c
0% 0% 0us 0us 510098 100% 0%( 0%) proc_pid_follow_link+0x38
0% 0% 0us 0us 2063912 100% 0%( 0%) proc_pid_maps_get_line+0x128
0% 0% 0us 0us 2063900 100% 0%( 0%) proc_pid_maps_get_line+0x48
0% 0% 0us 0us 2332202 100% 0%( 0%) proc_pid_status+0x120
0% 0% 0us 0us 2105301 100% 0%( 0%) proc_pid_status+0x184
0% 0% 0us 0us 1492685 100% 0%( 0%) remove_exclusive_swap_page+0xc
0% 0% 0us 0us 1492406 100% 0%( 0%) swap_entry_free+0x8
0% 0% 0us 0us 332694 100% 0%( 0%) swap_info_get+0xe0
0% 0% 0us 0us 338646 100% 0%( 0%) sys_fchown16+0x24
_________________________________________________________________________________________________________________________
Number of read locks found=10

Hanna, I suspect you're not to blame, but rather the global lock...


-- wli

2003-03-15 11:48:32

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

On Sat, Mar 15, 2003 at 12:24:31AM -0800, William Lee Irwin III wrote:
>> Next pass involves lockmeter:

On Sat, Mar 15, 2003 at 01:47:58AM -0800, William Lee Irwin III wrote:
> Throughput 39.2014 MB/sec 128 procs
> dbench 128 142.51s user 10828.91s system 964% cpu 18:57.88 total
> That's an 83% reduction in throughput from applying lockmeter.
> Um, somebody should look into this. The thing is a bloody doorstop:

Okay, dump_stack() every once in a while when we schedule() in down().

No good ideas how to script the results so I have the foggiest idea
who's the bad guy. gzipped and MIME attached (Sorry!) for space reasons.


-- wli


Attachments:
(No filename) (640.00 B)
brief message
sem.log.gz (5.63 kB)
sem.log.gz
Download all attachments

2003-03-15 11:58:16

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

William Lee Irwin III <[email protected]> wrote:
>
> On Sat, Mar 15, 2003 at 12:24:31AM -0800, William Lee Irwin III wrote:
> >> Next pass involves lockmeter:
>
> On Sat, Mar 15, 2003 at 01:47:58AM -0800, William Lee Irwin III wrote:
> > Throughput 39.2014 MB/sec 128 procs
> > dbench 128 142.51s user 10828.91s system 964% cpu 18:57.88 total
> > That's an 83% reduction in throughput from applying lockmeter.
> > Um, somebody should look into this. The thing is a bloody doorstop:
>
> Okay, dump_stack() every once in a while when we schedule() in down().

Thanks.

> No good ideas how to script the results so I have the foggiest idea
> who's the bad guy. gzipped and MIME attached (Sorry!) for space reasons.

lock_super() in the ext2 inode allocator mainly. It needs the same treatment.

2003-03-15 12:15:21

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] concurrent block allocation for ext2 against 2.5.64

On Sat, Mar 15, 2003 at 12:24:31AM -0800, William Lee Irwin III wrote:
>> Okay, dump_stack() every once in a while when we schedule() in down().

On Sat, Mar 15, 2003 at 04:08:19AM -0800, Andrew Morton wrote:
> Thanks.

No problem. I think we found out a number of things that help everyone.


On Sat, Mar 15, 2003 at 12:24:31AM -0800, William Lee Irwin III wrote:
>> No good ideas how to script the results so I have the foggiest idea
>> who's the bad guy. gzipped and MIME attached (Sorry!) for space reasons.

On Sat, Mar 15, 2003 at 04:08:19AM -0800, Andrew Morton wrote:
> lock_super() in the ext2 inode allocator mainly. It needs the same treatment.

Terrific! Not only have we resolved 16x ext2 contention issues we've
also identified a clear direction for 32x!!

Go fs hackers go! First 2.5 VM, now 2.6/2.7 VFS. What can't you do?


-- wli