I posted the initial version of this last week. There was no discussion about the patch itself. I can only hope that no news is good news :)
My first version of the patch reinitialized i_meta_lock for every read_inode call. This is not a correct way to do it. However, 2.4 does not have the capability to have fs-specific init_inode_once() functions. This probably meant altering the sb_ops structure if I wanted ext2_inode_info-specific initialization. To make the patch less intrusive, I put i_meta_lock right into the inode structure. I think that this is a good compromise between keeping ext2 code separated from VFS and patch simplicity. The lock is now initialized once per inode in fs/inode.c's init_once().
I noticed the extra lock initializations because I was using lockmeter and kernprof while running dbench. I was seeing a large throughput decrease when my patch was applied, and kernprof told me that a big chunk of CPU time was being spent in lockmeter's rwlock_alloc. I have the feeling that I've hit a bottleneck in lockmeter's code spinning on locks which protect lockmeter's data structures.
The patch is against 2.4.19-pre2. The patch significantly lowers BKL contention (50%) on a 2-way PII-300 running dbench 4. Dbench throughput is not significantly affected, but that is probably a function of my puny little machine more than the effectiveness of the patch. I'll have some results on a much more beefy 8-way PIII tomorrow. Earlier versions of the patch reduced BKL contention during dbench by 60% on the 8-way. CPU utilization spinning on the BKL has been as high as 40%.
--
Dave Hansen
[email protected]
diff -ur -X 2.5/exclude linux-2.4.19-pre2-clean/fs/ext2/balloc.c linux/fs/ext2/balloc.c
--- linux-2.4.19-pre2-clean/fs/ext2/balloc.c Tue Mar 5 15:18:38 2002
+++ linux/fs/ext2/balloc.c Tue Mar 5 15:16:57 2002
@@ -539,6 +539,7 @@
*/
#ifdef EXT2_PREALLOCATE
/* Writer: ->i_prealloc* */
+ write_lock(&inode->i_meta_lock);
if (prealloc_count && !*prealloc_count) {
int prealloc_goal;
unsigned long next_block = tmp + 1;
@@ -576,8 +577,8 @@
ext2_debug ("Preallocated a further %lu bits.\n",
(k - 1));
}
+ write_unlock(&inode->i_meta_lock);
#endif
-
j = tmp;
mark_buffer_dirty(bh);
diff -ur -X 2.5/exclude linux-2.4.19-pre2-clean/fs/ext2/inode.c linux/fs/ext2/inode.c
--- linux-2.4.19-pre2-clean/fs/ext2/inode.c Tue Mar 5 15:18:38 2002
+++ linux/fs/ext2/inode.c Tue Mar 5 15:16:57 2002
@@ -51,8 +51,6 @@
*/
void ext2_delete_inode (struct inode * inode)
{
- lock_kernel();
-
if (is_bad_inode(inode) ||
inode->i_ino == EXT2_ACL_IDX_INO ||
inode->i_ino == EXT2_ACL_DATA_INO)
@@ -60,6 +58,8 @@
inode->u.ext2_i.i_dtime = CURRENT_TIME;
mark_inode_dirty(inode);
ext2_update_inode(inode, IS_SYNC(inode));
+
+ lock_kernel();
inode->i_size = 0;
if (inode->i_blocks)
ext2_truncate (inode);
@@ -68,24 +68,26 @@
unlock_kernel();
return;
no_delete:
- unlock_kernel();
clear_inode(inode); /* We must guarantee clearing of inode... */
}
void ext2_discard_prealloc (struct inode * inode)
{
#ifdef EXT2_PREALLOCATE
- lock_kernel();
+ write_lock(&inode->i_meta_lock);
/* Writer: ->i_prealloc* */
if (inode->u.ext2_i.i_prealloc_count) {
unsigned short total = inode->u.ext2_i.i_prealloc_count;
unsigned long block = inode->u.ext2_i.i_prealloc_block;
inode->u.ext2_i.i_prealloc_count = 0;
inode->u.ext2_i.i_prealloc_block = 0;
+ write_unlock(&inode->i_meta_lock);
/* Writer: end */
ext2_free_blocks (inode, block, total);
+ } else {
+ write_unlock(&inode->i_meta_lock);
}
- unlock_kernel();
+
#endif
}
@@ -99,6 +101,7 @@
#ifdef EXT2_PREALLOCATE
/* Writer: ->i_prealloc* */
+ write_lock(&inode->i_meta_lock);
if (inode->u.ext2_i.i_prealloc_count &&
(goal == inode->u.ext2_i.i_prealloc_block ||
goal + 1 == inode->u.ext2_i.i_prealloc_block))
@@ -106,9 +109,11 @@
result = inode->u.ext2_i.i_prealloc_block++;
inode->u.ext2_i.i_prealloc_count--;
/* Writer: end */
+ write_unlock(&inode->i_meta_lock);
ext2_debug ("preallocation hit (%lu/%lu).\n",
++alloc_hits, ++alloc_attempts);
} else {
+ write_unlock(&inode->i_meta_lock);
ext2_discard_prealloc (inode);
ext2_debug ("preallocation miss (%lu/%lu).\n",
alloc_hits, ++alloc_attempts);
@@ -253,9 +258,11 @@
if (!bh)
goto failure;
/* Reader: pointers */
+ read_lock(&inode->i_meta_lock);
if (!verify_chain(chain, p))
goto changed;
add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
+ read_unlock(&inode->i_meta_lock);
/* Reader: end */
if (!p->key)
goto no_block;
@@ -263,6 +270,7 @@
return NULL;
changed:
+ read_unlock(&inode->i_meta_lock);
*err = -EAGAIN;
goto no_block;
failure:
@@ -328,6 +336,8 @@
unsigned long *goal)
{
/* Writer: ->i_next_alloc* */
+
+ write_lock(&inode->i_meta_lock);
if (block == inode->u.ext2_i.i_next_alloc_block + 1) {
inode->u.ext2_i.i_next_alloc_block++;
inode->u.ext2_i.i_next_alloc_goal++;
@@ -343,9 +353,11 @@
*goal = inode->u.ext2_i.i_next_alloc_goal;
if (!*goal)
*goal = ext2_find_near(inode, partial);
+ write_unlock(&inode->i_meta_lock);
return 0;
}
/* Reader: end */
+ write_unlock(&inode->i_meta_lock);
return -EAGAIN;
}
@@ -451,6 +463,7 @@
/* Verify that place we are splicing to is still there and vacant */
+ write_lock(&inode->i_meta_lock);
/* Writer: pointers, ->i_next_alloc* */
if (!verify_chain(chain, where-1) || *where->p)
/* Writer: end */
@@ -461,7 +474,7 @@
*where->p = where->key;
inode->u.ext2_i.i_next_alloc_block = block;
inode->u.ext2_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
-
+ write_unlock(&inode->i_meta_lock);
/* Writer: end */
/* We are done with atomic stuff, now do the rest of housekeeping */
@@ -484,6 +497,7 @@
return 0;
changed:
+ write_unlock(&inode->i_meta_lock);
for (i = 1; i < num; i++)
bforget(where[i].bh);
for (i = 0; i < num; i++)
@@ -517,7 +531,6 @@
if (depth == 0)
goto out;
- lock_kernel();
reread:
partial = ext2_get_branch(inode, depth, offsets, chain, &err);
@@ -539,7 +552,6 @@
brelse(partial->bh);
partial--;
}
- unlock_kernel();
out:
return err;
}
@@ -1149,9 +1161,7 @@
void ext2_write_inode (struct inode * inode, int wait)
{
- lock_kernel();
ext2_update_inode (inode, wait);
- unlock_kernel();
}
int ext2_sync_inode (struct inode *inode)
diff -ur -X 2.5/exclude linux-2.4.19-pre2-clean/fs/inode.c linux/fs/inode.c
--- linux-2.4.19-pre2-clean/fs/inode.c Fri Dec 21 09:41:55 2001
+++ linux/fs/inode.c Fri Mar 1 14:47:32 2002
@@ -110,6 +110,7 @@
sema_init(&inode->i_sem, 1);
sema_init(&inode->i_zombie, 1);
spin_lock_init(&inode->i_data.i_shared_lock);
+ rwlock_init(&inode->i_meta_lock);
}
}
diff -ur -X 2.5/exclude linux-2.4.19-pre2-clean/include/linux/fs.h linux/include/linux/fs.h
--- linux-2.4.19-pre2-clean/include/linux/fs.h Tue Mar 5 15:19:42 2002
+++ linux/include/linux/fs.h Tue Mar 5 15:17:47 2002
@@ -452,6 +452,7 @@
unsigned long i_version;
struct semaphore i_sem;
struct semaphore i_zombie;
+ rwlock_t i_meta_lock;
struct inode_operations *i_op;
struct file_operations *i_fop; /* former ->i_op->default_file_ops */
struct super_block *i_sb;
Alexander Viro wrote:
> Denied. You can trivially do that in ext2_read_inode() and ext2_new_inode().
Good point. That makes it much easier.
> ext2 patches _MUST_ get testing in 2.5 before they can go into 2.4. At
> the very least a month, preferably - two. Until then consider them vetoed
> for 2.4, no matter how BKL brigade feels about their crusade.
[email protected] 2002-02-11 21:26:50-08:00 [email protected]
So, it has been almost a month. Do you plan to port these changes back
to 2.4 yourself?
--
Dave Hansen
[email protected]
> The patch is against 2.4.19-pre2. The patch significantly lowers BKL contention (50%) on a 2-way PII-300 running dbench 4. Dbench throughput is not significantly affected, but that is probably a function of my puny little machine more than the effectiveness of the patch. I'll have some results on a much more beefy 8-way PIII tomorrow. Earlier versions of the patch reduced BKL contention during dbench by 60% on the 8-way. CPU utilization spinning on the BKL has been as high as 40%.
I certainly am not interested in it. 2.4 locking changes for very big boxes
strike me as a little dangerous.
On Tue, 5 Mar 2002, Dave Hansen wrote:
First of all, learn to use fscking line breaks.
> I posted the initial version of this last week. There was no discussion about the patch itself. I can only hope that no news is good news :)
>
> My first version of the patch reinitialized i_meta_lock for every read_inode call. This is not a correct way to do it. However, 2.4 does not have the capability to have fs-specific init_inode_once() functions. This probably meant altering the sb_ops structure if I wanted ext2_inode_info-specific initialization. To make the patch less intrusive, I put i_meta_lock right into the inode structure. I think that this is a good compromise between keeping ext2 code separated from VFS and patch simplicity. The lock is now initialized once per inode in fs/inode.c's init_once().
Denied. You can trivially do that in ext2_read_inode() and ext2_new_inode().
> I noticed the extra lock initializations because I was using lockmeter and kernprof while running dbench. I was seeing a large throughput decrease when my patch was applied, and kernprof told me that a big chunk of CPU time was being spent in lockmeter's rwlock_alloc. I have the feeling that I've hit a bottleneck in lockmeter's code spinning on locks which protect lockmeter's data structures.
>
> The patch is against 2.4.19-pre2. The patch significantly lowers BKL contention (50%) on a 2-way PII-300 running dbench 4. Dbench throughput is not significantly affected, but that is probably a function of my puny little machine more than the effectiveness of the patch. I'll have some results on a much more beefy 8-way PIII tomorrow. Earlier versions of the patch reduced BKL contention during dbench by 60% on the 8-way. CPU utilization spinning on the BKL has been as high as 40%.
ext2 patches _MUST_ get testing in 2.5 before they can go into 2.4. At
the very least a month, preferably - two. Until then consider them vetoed
for 2.4, no matter how BKL brigade feels about their crusade.
Alan Cox wrote:
> I certainly am not interested in it. 2.4 locking changes for very big boxes
> strike me as a little dangerous.
I think that I may have presented the patch in the wrong way. The
primary reason I'm doing this is BKL removal. The ext2 code just
happened to be one of the worst offenders that I'd run into. I've been
using the 8-way with dbench because it produces the worst-case
scenario I can think of. I also like watching it compile kernels in
just over a minute. :)
This patch was also a backport of an Al Viro 2.5 change, so I consider
it pretty safe. Only time and testing can tell, but I've tested it
about as much as I can.
All of this becomes pretty academic if Al decides that he will backport
the 2.5 changes, which we all want to see.
--
Dave Hansen
[email protected]
Mike Fedyk wrote:
> Also, there seems to be a plan already in progress to do this on
> 2.5. BKL has been moved around in several filesystems to ease
> transition to another lock.
As I indicated elsewhere in the thread, this _is_ a 2.5 backport
chosen because of the tremendous impact on BKL contention.
--
Dave Hansen
[email protected]
Dave Hansen wrote:
>
> Alexander Viro wrote:
> > Denied. You can trivially do that in ext2_read_inode() and ext2_new_inode().
> Good point. That makes it much easier.
>
> > ext2 patches _MUST_ get testing in 2.5 before they can go into 2.4. At
> > the very least a month, preferably - two. Until then consider them vetoed
> > for 2.4, no matter how BKL brigade feels about their crusade.
> [email protected] 2002-02-11 21:26:50-08:00 [email protected]
>
> So, it has been almost a month. Do you plan to port these changes back
> to 2.4 yourself?
>
2.5.5's ext2 has an SMP race, probably in get_block, which results in
the freeing of already-free buffers. That has not yet been found and
fixed.
-
diff -ur -X 2.5/exclude linux-2.4.19-pre2-clean/fs/ext2/balloc.c linux/fs/ext2/balloc.c
--- linux-2.4.19-pre2-clean/fs/ext2/balloc.c Tue Mar 5 15:18:38 2002
+++ linux/fs/ext2/balloc.c Tue Mar 5 17:04:32 2002
@@ -539,6 +539,7 @@
*/
#ifdef EXT2_PREALLOCATE
/* Writer: ->i_prealloc* */
+ write_lock(&inode->u.ext2_i.i_meta_lock);
if (prealloc_count && !*prealloc_count) {
int prealloc_goal;
unsigned long next_block = tmp + 1;
@@ -576,8 +577,8 @@
ext2_debug ("Preallocated a further %lu bits.\n",
(k - 1));
}
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
#endif
-
j = tmp;
mark_buffer_dirty(bh);
diff -ur -X 2.5/exclude linux-2.4.19-pre2-clean/fs/ext2/ialloc.c linux/fs/ext2/ialloc.c
--- linux-2.4.19-pre2-clean/fs/ext2/ialloc.c Tue Mar 5 15:18:38 2002
+++ linux/fs/ext2/ialloc.c Tue Mar 5 17:05:29 2002
@@ -392,6 +392,7 @@
inode->u.ext2_i.i_block_group = group;
if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL)
inode->i_flags |= S_SYNC;
+ rwlock_init(&inode->u.ext2_i.i_meta_lock);
insert_inode_hash(inode);
inode->i_generation = event++;
mark_inode_dirty(inode);
diff -ur -X 2.5/exclude linux-2.4.19-pre2-clean/fs/ext2/inode.c linux/fs/ext2/inode.c
--- linux-2.4.19-pre2-clean/fs/ext2/inode.c Tue Mar 5 15:18:38 2002
+++ linux/fs/ext2/inode.c Tue Mar 5 19:41:56 2002
@@ -51,8 +51,6 @@
*/
void ext2_delete_inode (struct inode * inode)
{
- lock_kernel();
-
if (is_bad_inode(inode) ||
inode->i_ino == EXT2_ACL_IDX_INO ||
inode->i_ino == EXT2_ACL_DATA_INO)
@@ -60,6 +58,8 @@
inode->u.ext2_i.i_dtime = CURRENT_TIME;
mark_inode_dirty(inode);
ext2_update_inode(inode, IS_SYNC(inode));
+
+ lock_kernel();
inode->i_size = 0;
if (inode->i_blocks)
ext2_truncate (inode);
@@ -68,24 +68,26 @@
unlock_kernel();
return;
no_delete:
- unlock_kernel();
clear_inode(inode); /* We must guarantee clearing of inode... */
}
void ext2_discard_prealloc (struct inode * inode)
{
#ifdef EXT2_PREALLOCATE
- lock_kernel();
+ write_lock(&inode->u.ext2_i.i_meta_lock);
/* Writer: ->i_prealloc* */
if (inode->u.ext2_i.i_prealloc_count) {
unsigned short total = inode->u.ext2_i.i_prealloc_count;
unsigned long block = inode->u.ext2_i.i_prealloc_block;
inode->u.ext2_i.i_prealloc_count = 0;
inode->u.ext2_i.i_prealloc_block = 0;
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
/* Writer: end */
ext2_free_blocks (inode, block, total);
+ } else {
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
}
- unlock_kernel();
+
#endif
}
@@ -99,6 +101,7 @@
#ifdef EXT2_PREALLOCATE
/* Writer: ->i_prealloc* */
+ write_lock(&inode->u.ext2_i.i_meta_lock);
if (inode->u.ext2_i.i_prealloc_count &&
(goal == inode->u.ext2_i.i_prealloc_block ||
goal + 1 == inode->u.ext2_i.i_prealloc_block))
@@ -106,9 +109,11 @@
result = inode->u.ext2_i.i_prealloc_block++;
inode->u.ext2_i.i_prealloc_count--;
/* Writer: end */
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
ext2_debug ("preallocation hit (%lu/%lu).\n",
++alloc_hits, ++alloc_attempts);
} else {
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
ext2_discard_prealloc (inode);
ext2_debug ("preallocation miss (%lu/%lu).\n",
alloc_hits, ++alloc_attempts);
@@ -253,9 +258,11 @@
if (!bh)
goto failure;
/* Reader: pointers */
+ read_lock(&inode->u.ext2_i.i_meta_lock);
if (!verify_chain(chain, p))
goto changed;
add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
+ read_unlock(&inode->u.ext2_i.i_meta_lock);
/* Reader: end */
if (!p->key)
goto no_block;
@@ -263,6 +270,7 @@
return NULL;
changed:
+ read_unlock(&inode->u.ext2_i.i_meta_lock);
*err = -EAGAIN;
goto no_block;
failure:
@@ -328,6 +336,8 @@
unsigned long *goal)
{
/* Writer: ->i_next_alloc* */
+
+ write_lock(&inode->u.ext2_i.i_meta_lock);
if (block == inode->u.ext2_i.i_next_alloc_block + 1) {
inode->u.ext2_i.i_next_alloc_block++;
inode->u.ext2_i.i_next_alloc_goal++;
@@ -343,9 +353,11 @@
*goal = inode->u.ext2_i.i_next_alloc_goal;
if (!*goal)
*goal = ext2_find_near(inode, partial);
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
return 0;
}
/* Reader: end */
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
return -EAGAIN;
}
@@ -451,6 +463,7 @@
/* Verify that place we are splicing to is still there and vacant */
+ write_lock(&inode->u.ext2_i.i_meta_lock);
/* Writer: pointers, ->i_next_alloc* */
if (!verify_chain(chain, where-1) || *where->p)
/* Writer: end */
@@ -461,7 +474,7 @@
*where->p = where->key;
inode->u.ext2_i.i_next_alloc_block = block;
inode->u.ext2_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
-
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
/* Writer: end */
/* We are done with atomic stuff, now do the rest of housekeeping */
@@ -484,6 +497,7 @@
return 0;
changed:
+ write_unlock(&inode->u.ext2_i.i_meta_lock);
for (i = 1; i < num; i++)
bforget(where[i].bh);
for (i = 0; i < num; i++)
@@ -517,7 +531,6 @@
if (depth == 0)
goto out;
- lock_kernel();
reread:
partial = ext2_get_branch(inode, depth, offsets, chain, &err);
@@ -539,7 +552,6 @@
brelse(partial->bh);
partial--;
}
- unlock_kernel();
out:
return err;
}
@@ -942,6 +954,7 @@
inode->i_ctime = le32_to_cpu(raw_inode->i_ctime);
inode->i_mtime = le32_to_cpu(raw_inode->i_mtime);
inode->u.ext2_i.i_dtime = le32_to_cpu(raw_inode->i_dtime);
+ rwlock_init(&inode->u.ext2_i.i_meta_lock);
/* We now have enough fields to check if the inode was active or not.
* This is needed because nfsd might try to access dead inodes
* the test is that same one that e2fsck uses
@@ -1149,9 +1162,7 @@
void ext2_write_inode (struct inode * inode, int wait)
{
- lock_kernel();
ext2_update_inode (inode, wait);
- unlock_kernel();
}
int ext2_sync_inode (struct inode *inode)
diff -ur -X 2.5/exclude linux-2.4.19-pre2-clean/include/linux/ext2_fs_i.h linux/include/linux/ext2_fs_i.h
--- linux-2.4.19-pre2-clean/include/linux/ext2_fs_i.h Mon Sep 17 13:16:30 2001
+++ linux/include/linux/ext2_fs_i.h Tue Mar 5 16:57:25 2002
@@ -36,6 +36,7 @@
__u32 i_prealloc_count;
__u32 i_dir_start_lookup;
int i_new_inode:1; /* Is a freshly allocated inode */
+ rwlock_t i_meta_lock;
};
#endif /* _LINUX_EXT2_FS_I */