I've been debugging a hang in jbd2_journal_release_jbd_inode
which is being seen on Power 6 systems quite a lot. When we get
in the hung state, all I/O to the disk in question gets blocked
where we stay indefinitely. Looking at the task list, I can see
we are stuck in jbd2_journal_release_jbd_inode waiting on a
wake up. I added some debug code to detect this scenario and
dump additional data if we were stuck in jbd2_journal_release_jbd_inode
for longer than 30 minutes. When it hit, I was able to see that
i_flags was 0, suggesting we missed the wake up.
This patch changes i_flags to be an unsigned long, uses bit operators
to access it, and adds barriers around the accesses. Prior to applying
this patch, we were regularly hitting this hang on numerous systems
in our test environment. After applying the patch, the hangs no longer
occur. Its still not clear to me why the j_list_lock doesn't protect us
in this path. It also appears a hang very similar to this was seen
in the past and then was no longer recreatable:
http://forum.soft32.com/linux/20090310-ext4-hangs-ftopict478916.html
Signed-off-by: Brian King <[email protected]>
---
fs/jbd2/commit.c | 14 ++++++++++----
fs/jbd2/journal.c | 5 ++++-
include/linux/jbd2.h | 2 +-
3 files changed, 15 insertions(+), 6 deletions(-)
diff -puN include/linux/jbd2.h~jbd2_ji_commit_barrier_patch include/linux/jbd2.h
--- linux-2.6/include/linux/jbd2.h~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
+++ linux-2.6-bjking1/include/linux/jbd2.h 2010-07-07 09:01:12.000000000 -0500
@@ -395,7 +395,7 @@ struct jbd2_inode {
struct inode *i_vfs_inode;
/* Flags of inode [j_list_lock] */
- unsigned int i_flags;
+ unsigned long i_flags;
};
struct jbd2_revoke_table_s;
diff -puN fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch fs/jbd2/commit.c
--- linux-2.6/fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
+++ linux-2.6-bjking1/fs/jbd2/commit.c 2010-07-07 09:01:12.000000000 -0500
@@ -26,7 +26,9 @@
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/bitops.h>
#include <trace/events/jbd2.h>
+#include <asm/system.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -245,7 +247,7 @@ static int journal_submit_data_buffers(j
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
mapping = jinode->i_vfs_inode->i_mapping;
- jinode->i_flags |= JI_COMMIT_RUNNING;
+ set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
spin_unlock(&journal->j_list_lock);
/*
* submit the inode data buffers. We use writepage
@@ -260,7 +262,9 @@ static int journal_submit_data_buffers(j
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
commit_transaction->t_flushed_data_blocks = 1;
- jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ smp_mb__before_clear_bit();
+ clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+ smp_mb__before_clear_bit();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
spin_unlock(&journal->j_list_lock);
@@ -281,7 +285,7 @@ static int journal_finish_inode_data_buf
/* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
- jinode->i_flags |= JI_COMMIT_RUNNING;
+ set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
spin_unlock(&journal->j_list_lock);
err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
if (err) {
@@ -297,7 +301,9 @@ static int journal_finish_inode_data_buf
ret = err;
}
spin_lock(&journal->j_list_lock);
- jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ smp_mb__before_clear_bit();
+ clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+ smp_mb__before_clear_bit();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
diff -puN fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch fs/jbd2/journal.c
--- linux-2.6/fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
+++ linux-2.6-bjking1/fs/jbd2/journal.c 2010-07-07 09:01:12.000000000 -0500
@@ -41,12 +41,14 @@
#include <linux/hash.h>
#include <linux/log2.h>
#include <linux/vmalloc.h>
+#include <linux/bitops.h>
#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>
#include <asm/uaccess.h>
#include <asm/page.h>
+#include <asm/system.h>
EXPORT_SYMBOL(jbd2_journal_start);
EXPORT_SYMBOL(jbd2_journal_restart);
@@ -2209,9 +2211,10 @@ void jbd2_journal_release_jbd_inode(jour
restart:
spin_lock(&journal->j_list_lock);
/* Is commit writing out inode - we have to wait */
- if (jinode->i_flags & JI_COMMIT_RUNNING) {
+ if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+ smp_mb();
wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
spin_unlock(&journal->j_list_lock);
_
Brian King wrote:
> I've been debugging a hang in jbd2_journal_release_jbd_inode
> which is being seen on Power 6 systems quite a lot. When we get
> in the hung state, all I/O to the disk in question gets blocked
> where we stay indefinitely. Looking at the task list, I can see
> we are stuck in jbd2_journal_release_jbd_inode waiting on a
> wake up. I added some debug code to detect this scenario and
> dump additional data if we were stuck in jbd2_journal_release_jbd_inode
> for longer than 30 minutes. When it hit, I was able to see that
> i_flags was 0, suggesting we missed the wake up.
>
> This patch changes i_flags to be an unsigned long, uses bit operators
Did you happen to try just changing the jinode i_flags to be an unsigned long?
The current unsigned int seems to be the wrong thing to use with
bit_waitqueue() and friends, does that change alone fix it?
-Eric
> to access it, and adds barriers around the accesses. Prior to applying
> this patch, we were regularly hitting this hang on numerous systems
> in our test environment. After applying the patch, the hangs no longer
> occur. Its still not clear to me why the j_list_lock doesn't protect us
> in this path. It also appears a hang very similar to this was seen
> in the past and then was no longer recreatable:
>
> http://forum.soft32.com/linux/20090310-ext4-hangs-ftopict478916.html
>
> Signed-off-by: Brian King <[email protected]>
> ---
>
> fs/jbd2/commit.c | 14 ++++++++++----
> fs/jbd2/journal.c | 5 ++++-
> include/linux/jbd2.h | 2 +-
> 3 files changed, 15 insertions(+), 6 deletions(-)
>
> diff -puN include/linux/jbd2.h~jbd2_ji_commit_barrier_patch include/linux/jbd2.h
> --- linux-2.6/include/linux/jbd2.h~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
> +++ linux-2.6-bjking1/include/linux/jbd2.h 2010-07-07 09:01:12.000000000 -0500
> @@ -395,7 +395,7 @@ struct jbd2_inode {
> struct inode *i_vfs_inode;
>
> /* Flags of inode [j_list_lock] */
> - unsigned int i_flags;
> + unsigned long i_flags;
> };
>
> struct jbd2_revoke_table_s;
> diff -puN fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch fs/jbd2/commit.c
> --- linux-2.6/fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
> +++ linux-2.6-bjking1/fs/jbd2/commit.c 2010-07-07 09:01:12.000000000 -0500
> @@ -26,7 +26,9 @@
> #include <linux/backing-dev.h>
> #include <linux/bio.h>
> #include <linux/blkdev.h>
> +#include <linux/bitops.h>
> #include <trace/events/jbd2.h>
> +#include <asm/system.h>
>
> /*
> * Default IO end handler for temporary BJ_IO buffer_heads.
> @@ -245,7 +247,7 @@ static int journal_submit_data_buffers(j
> spin_lock(&journal->j_list_lock);
> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
> mapping = jinode->i_vfs_inode->i_mapping;
> - jinode->i_flags |= JI_COMMIT_RUNNING;
> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
> spin_unlock(&journal->j_list_lock);
> /*
> * submit the inode data buffers. We use writepage
> @@ -260,7 +262,9 @@ static int journal_submit_data_buffers(j
> spin_lock(&journal->j_list_lock);
> J_ASSERT(jinode->i_transaction == commit_transaction);
> commit_transaction->t_flushed_data_blocks = 1;
> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
> + smp_mb__before_clear_bit();
> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
> + smp_mb__before_clear_bit();
> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
> }
> spin_unlock(&journal->j_list_lock);
> @@ -281,7 +285,7 @@ static int journal_finish_inode_data_buf
> /* For locking, see the comment in journal_submit_data_buffers() */
> spin_lock(&journal->j_list_lock);
> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
> - jinode->i_flags |= JI_COMMIT_RUNNING;
> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
> spin_unlock(&journal->j_list_lock);
> err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
> if (err) {
> @@ -297,7 +301,9 @@ static int journal_finish_inode_data_buf
> ret = err;
> }
> spin_lock(&journal->j_list_lock);
> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
> + smp_mb__before_clear_bit();
> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
> + smp_mb__before_clear_bit();
> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
> }
>
> diff -puN fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch fs/jbd2/journal.c
> --- linux-2.6/fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
> +++ linux-2.6-bjking1/fs/jbd2/journal.c 2010-07-07 09:01:12.000000000 -0500
> @@ -41,12 +41,14 @@
> #include <linux/hash.h>
> #include <linux/log2.h>
> #include <linux/vmalloc.h>
> +#include <linux/bitops.h>
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/jbd2.h>
>
> #include <asm/uaccess.h>
> #include <asm/page.h>
> +#include <asm/system.h>
>
> EXPORT_SYMBOL(jbd2_journal_start);
> EXPORT_SYMBOL(jbd2_journal_restart);
> @@ -2209,9 +2211,10 @@ void jbd2_journal_release_jbd_inode(jour
> restart:
> spin_lock(&journal->j_list_lock);
> /* Is commit writing out inode - we have to wait */
> - if (jinode->i_flags & JI_COMMIT_RUNNING) {
> + if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
> wait_queue_head_t *wq;
> DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
> + smp_mb();
> wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
> prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
> spin_unlock(&journal->j_list_lock);
> _
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On 07/14/2010 11:32 AM, Eric Sandeen wrote:
> Brian King wrote:
>> I've been debugging a hang in jbd2_journal_release_jbd_inode
>> which is being seen on Power 6 systems quite a lot. When we get
>> in the hung state, all I/O to the disk in question gets blocked
>> where we stay indefinitely. Looking at the task list, I can see
>> we are stuck in jbd2_journal_release_jbd_inode waiting on a
>> wake up. I added some debug code to detect this scenario and
>> dump additional data if we were stuck in jbd2_journal_release_jbd_inode
>> for longer than 30 minutes. When it hit, I was able to see that
>> i_flags was 0, suggesting we missed the wake up.
>>
>> This patch changes i_flags to be an unsigned long, uses bit operators
>
> Did you happen to try just changing the jinode i_flags to be an unsigned long?
> The current unsigned int seems to be the wrong thing to use with
> bit_waitqueue() and friends, does that change alone fix it?
I am testing a patch without the added barriers, but don't have too much
runtime on it yet. I'll spin a kernel with just the unsigned long change
and see how that holds up.
Thanks,
Brian
>
> -Eric
>
>> to access it, and adds barriers around the accesses. Prior to applying
>> this patch, we were regularly hitting this hang on numerous systems
>> in our test environment. After applying the patch, the hangs no longer
>> occur. Its still not clear to me why the j_list_lock doesn't protect us
>> in this path. It also appears a hang very similar to this was seen
>> in the past and then was no longer recreatable:
>>
>> http://forum.soft32.com/linux/20090310-ext4-hangs-ftopict478916.html
>>
>> Signed-off-by: Brian King <[email protected]>
>> ---
>>
>> fs/jbd2/commit.c | 14 ++++++++++----
>> fs/jbd2/journal.c | 5 ++++-
>> include/linux/jbd2.h | 2 +-
>> 3 files changed, 15 insertions(+), 6 deletions(-)
>>
>> diff -puN include/linux/jbd2.h~jbd2_ji_commit_barrier_patch include/linux/jbd2.h
>> --- linux-2.6/include/linux/jbd2.h~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
>> +++ linux-2.6-bjking1/include/linux/jbd2.h 2010-07-07 09:01:12.000000000 -0500
>> @@ -395,7 +395,7 @@ struct jbd2_inode {
>> struct inode *i_vfs_inode;
>>
>> /* Flags of inode [j_list_lock] */
>> - unsigned int i_flags;
>> + unsigned long i_flags;
>> };
>>
>> struct jbd2_revoke_table_s;
>> diff -puN fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch fs/jbd2/commit.c
>> --- linux-2.6/fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
>> +++ linux-2.6-bjking1/fs/jbd2/commit.c 2010-07-07 09:01:12.000000000 -0500
>> @@ -26,7 +26,9 @@
>> #include <linux/backing-dev.h>
>> #include <linux/bio.h>
>> #include <linux/blkdev.h>
>> +#include <linux/bitops.h>
>> #include <trace/events/jbd2.h>
>> +#include <asm/system.h>
>>
>> /*
>> * Default IO end handler for temporary BJ_IO buffer_heads.
>> @@ -245,7 +247,7 @@ static int journal_submit_data_buffers(j
>> spin_lock(&journal->j_list_lock);
>> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
>> mapping = jinode->i_vfs_inode->i_mapping;
>> - jinode->i_flags |= JI_COMMIT_RUNNING;
>> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> spin_unlock(&journal->j_list_lock);
>> /*
>> * submit the inode data buffers. We use writepage
>> @@ -260,7 +262,9 @@ static int journal_submit_data_buffers(j
>> spin_lock(&journal->j_list_lock);
>> J_ASSERT(jinode->i_transaction == commit_transaction);
>> commit_transaction->t_flushed_data_blocks = 1;
>> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
>> + smp_mb__before_clear_bit();
>> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> + smp_mb__before_clear_bit();
>> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> }
>> spin_unlock(&journal->j_list_lock);
>> @@ -281,7 +285,7 @@ static int journal_finish_inode_data_buf
>> /* For locking, see the comment in journal_submit_data_buffers() */
>> spin_lock(&journal->j_list_lock);
>> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
>> - jinode->i_flags |= JI_COMMIT_RUNNING;
>> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> spin_unlock(&journal->j_list_lock);
>> err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
>> if (err) {
>> @@ -297,7 +301,9 @@ static int journal_finish_inode_data_buf
>> ret = err;
>> }
>> spin_lock(&journal->j_list_lock);
>> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
>> + smp_mb__before_clear_bit();
>> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> + smp_mb__before_clear_bit();
>> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> }
>>
>> diff -puN fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch fs/jbd2/journal.c
>> --- linux-2.6/fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
>> +++ linux-2.6-bjking1/fs/jbd2/journal.c 2010-07-07 09:01:12.000000000 -0500
>> @@ -41,12 +41,14 @@
>> #include <linux/hash.h>
>> #include <linux/log2.h>
>> #include <linux/vmalloc.h>
>> +#include <linux/bitops.h>
>>
>> #define CREATE_TRACE_POINTS
>> #include <trace/events/jbd2.h>
>>
>> #include <asm/uaccess.h>
>> #include <asm/page.h>
>> +#include <asm/system.h>
>>
>> EXPORT_SYMBOL(jbd2_journal_start);
>> EXPORT_SYMBOL(jbd2_journal_restart);
>> @@ -2209,9 +2211,10 @@ void jbd2_journal_release_jbd_inode(jour
>> restart:
>> spin_lock(&journal->j_list_lock);
>> /* Is commit writing out inode - we have to wait */
>> - if (jinode->i_flags & JI_COMMIT_RUNNING) {
>> + if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
>> wait_queue_head_t *wq;
>> DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
>> + smp_mb();
>> wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
>> spin_unlock(&journal->j_list_lock);
>> _
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
Brian King
Linux on Power Virtualization
IBM Linux Technology Center
Eric Sandeen wrote:
> Brian King wrote:
>> I've been debugging a hang in jbd2_journal_release_jbd_inode
>> which is being seen on Power 6 systems quite a lot. When we get
>> in the hung state, all I/O to the disk in question gets blocked
>> where we stay indefinitely. Looking at the task list, I can see
>> we are stuck in jbd2_journal_release_jbd_inode waiting on a
>> wake up. I added some debug code to detect this scenario and
>> dump additional data if we were stuck in jbd2_journal_release_jbd_inode
>> for longer than 30 minutes. When it hit, I was able to see that
>> i_flags was 0, suggesting we missed the wake up.
>>
>> This patch changes i_flags to be an unsigned long, uses bit operators
>
> Did you happen to try just changing the jinode i_flags to be an unsigned long?
> The current unsigned int seems to be the wrong thing to use with
> bit_waitqueue() and friends, does that change alone fix it?
I guess I spoke too soon (not that familiar w/ the bit waitqueue stuff TBH)
The wake_up_bit() comments are relevant I guess:
* In order for this to function properly, as it uses waitqueue_active()
* internally, some kind of memory barrier must be done prior to calling
* this. Typically, this will be smp_mb__after_clear_bit(), but in some
* cases where bitflags are manipulated non-atomically under a lock, one
* may need to use a less regular barrier, such fs/inode.c's smp_mb(),
* because spin_unlock() does not guarantee a memory barrier.
So I think the patch makes sense to me but more comments from someone
more familiar with these semantics would be great. :)
Thanks,
-Eric
> -Eric
>
>> to access it, and adds barriers around the accesses. Prior to applying
>> this patch, we were regularly hitting this hang on numerous systems
>> in our test environment. After applying the patch, the hangs no longer
>> occur. Its still not clear to me why the j_list_lock doesn't protect us
>> in this path. It also appears a hang very similar to this was seen
>> in the past and then was no longer recreatable:
>>
>> http://forum.soft32.com/linux/20090310-ext4-hangs-ftopict478916.html
>>
>> Signed-off-by: Brian King <[email protected]>
>> ---
>>
>> fs/jbd2/commit.c | 14 ++++++++++----
>> fs/jbd2/journal.c | 5 ++++-
>> include/linux/jbd2.h | 2 +-
>> 3 files changed, 15 insertions(+), 6 deletions(-)
>>
>> diff -puN include/linux/jbd2.h~jbd2_ji_commit_barrier_patch include/linux/jbd2.h
>> --- linux-2.6/include/linux/jbd2.h~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
>> +++ linux-2.6-bjking1/include/linux/jbd2.h 2010-07-07 09:01:12.000000000 -0500
>> @@ -395,7 +395,7 @@ struct jbd2_inode {
>> struct inode *i_vfs_inode;
>>
>> /* Flags of inode [j_list_lock] */
>> - unsigned int i_flags;
>> + unsigned long i_flags;
>> };
>>
>> struct jbd2_revoke_table_s;
>> diff -puN fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch fs/jbd2/commit.c
>> --- linux-2.6/fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
>> +++ linux-2.6-bjking1/fs/jbd2/commit.c 2010-07-07 09:01:12.000000000 -0500
>> @@ -26,7 +26,9 @@
>> #include <linux/backing-dev.h>
>> #include <linux/bio.h>
>> #include <linux/blkdev.h>
>> +#include <linux/bitops.h>
>> #include <trace/events/jbd2.h>
>> +#include <asm/system.h>
>>
>> /*
>> * Default IO end handler for temporary BJ_IO buffer_heads.
>> @@ -245,7 +247,7 @@ static int journal_submit_data_buffers(j
>> spin_lock(&journal->j_list_lock);
>> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
>> mapping = jinode->i_vfs_inode->i_mapping;
>> - jinode->i_flags |= JI_COMMIT_RUNNING;
>> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> spin_unlock(&journal->j_list_lock);
>> /*
>> * submit the inode data buffers. We use writepage
>> @@ -260,7 +262,9 @@ static int journal_submit_data_buffers(j
>> spin_lock(&journal->j_list_lock);
>> J_ASSERT(jinode->i_transaction == commit_transaction);
>> commit_transaction->t_flushed_data_blocks = 1;
>> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
>> + smp_mb__before_clear_bit();
>> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> + smp_mb__before_clear_bit();
>> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> }
>> spin_unlock(&journal->j_list_lock);
>> @@ -281,7 +285,7 @@ static int journal_finish_inode_data_buf
>> /* For locking, see the comment in journal_submit_data_buffers() */
>> spin_lock(&journal->j_list_lock);
>> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
>> - jinode->i_flags |= JI_COMMIT_RUNNING;
>> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> spin_unlock(&journal->j_list_lock);
>> err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
>> if (err) {
>> @@ -297,7 +301,9 @@ static int journal_finish_inode_data_buf
>> ret = err;
>> }
>> spin_lock(&journal->j_list_lock);
>> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
>> + smp_mb__before_clear_bit();
>> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> + smp_mb__before_clear_bit();
>> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> }
>>
>> diff -puN fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch fs/jbd2/journal.c
>> --- linux-2.6/fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
>> +++ linux-2.6-bjking1/fs/jbd2/journal.c 2010-07-07 09:01:12.000000000 -0500
>> @@ -41,12 +41,14 @@
>> #include <linux/hash.h>
>> #include <linux/log2.h>
>> #include <linux/vmalloc.h>
>> +#include <linux/bitops.h>
>>
>> #define CREATE_TRACE_POINTS
>> #include <trace/events/jbd2.h>
>>
>> #include <asm/uaccess.h>
>> #include <asm/page.h>
>> +#include <asm/system.h>
>>
>> EXPORT_SYMBOL(jbd2_journal_start);
>> EXPORT_SYMBOL(jbd2_journal_restart);
>> @@ -2209,9 +2211,10 @@ void jbd2_journal_release_jbd_inode(jour
>> restart:
>> spin_lock(&journal->j_list_lock);
>> /* Is commit writing out inode - we have to wait */
>> - if (jinode->i_flags & JI_COMMIT_RUNNING) {
>> + if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
>> wait_queue_head_t *wq;
>> DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
>> + smp_mb();
>> wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
>> spin_unlock(&journal->j_list_lock);
>> _
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jul 14, 2010 at 09:56:15AM -0500, Brian King wrote:
>
> I've been debugging a hang in jbd2_journal_release_jbd_inode
> which is being seen on Power 6 systems quite a lot. When we get
> in the hung state, all I/O to the disk in question gets blocked
> where we stay indefinitely. Looking at the task list, I can see
> we are stuck in jbd2_journal_release_jbd_inode waiting on a
> wake up. I added some debug code to detect this scenario and
> dump additional data if we were stuck in jbd2_journal_release_jbd_inode
> for longer than 30 minutes. When it hit, I was able to see that
> i_flags was 0, suggesting we missed the wake up.
>
> This patch changes i_flags to be an unsigned long, uses bit operators
> to access it, and adds barriers around the accesses. Prior to applying
> this patch, we were regularly hitting this hang on numerous systems
> in our test environment. After applying the patch, the hangs no longer
> occur. Its still not clear to me why the j_list_lock doesn't protect us
> in this path. It also appears a hang very similar to this was seen
> in the past and then was no longer recreatable:
>
> http://forum.soft32.com/linux/20090310-ext4-hangs-ftopict478916.html
>
> Signed-off-by: Brian King <[email protected]>
> ---
>
> fs/jbd2/commit.c | 14 ++++++++++----
> fs/jbd2/journal.c | 5 ++++-
> include/linux/jbd2.h | 2 +-
> 3 files changed, 15 insertions(+), 6 deletions(-)
>
> diff -puN include/linux/jbd2.h~jbd2_ji_commit_barrier_patch include/linux/jbd2.h
> --- linux-2.6/include/linux/jbd2.h~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
> +++ linux-2.6-bjking1/include/linux/jbd2.h 2010-07-07 09:01:12.000000000 -0500
> @@ -395,7 +395,7 @@ struct jbd2_inode {
> struct inode *i_vfs_inode;
>
> /* Flags of inode [j_list_lock] */
> - unsigned int i_flags;
> + unsigned long i_flags;
> };
>
> struct jbd2_revoke_table_s;
> diff -puN fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch fs/jbd2/commit.c
> --- linux-2.6/fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch 2010-07-07 09:01:12.000000000 -0500
> +++ linux-2.6-bjking1/fs/jbd2/commit.c 2010-07-07 09:01:12.000000000 -0500
> @@ -26,7 +26,9 @@
> #include <linux/backing-dev.h>
> #include <linux/bio.h>
> #include <linux/blkdev.h>
> +#include <linux/bitops.h>
> #include <trace/events/jbd2.h>
> +#include <asm/system.h>
>
> /*
> * Default IO end handler for temporary BJ_IO buffer_heads.
> @@ -245,7 +247,7 @@ static int journal_submit_data_buffers(j
> spin_lock(&journal->j_list_lock);
> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
> mapping = jinode->i_vfs_inode->i_mapping;
> - jinode->i_flags |= JI_COMMIT_RUNNING;
> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
> spin_unlock(&journal->j_list_lock);
> /*
> * submit the inode data buffers. We use writepage
> @@ -260,7 +262,9 @@ static int journal_submit_data_buffers(j
> spin_lock(&journal->j_list_lock);
> J_ASSERT(jinode->i_transaction == commit_transaction);
> commit_transaction->t_flushed_data_blocks = 1;
> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
> + smp_mb__before_clear_bit();
> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
> + smp_mb__before_clear_bit();
> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
> }
This seems to be a bit overkill, you could probably just get away with
clear_bit()
smp_mb__after_clear_bit
That should be sufficient. Other than that, it seems good. Thanks,
Josef
I've been debugging a hang in jbd2_journal_release_jbd_inode
which is being seen on Power 6 systems quite a lot. When we get
in the hung state, all I/O to the disk in question gets blocked
where we stay indefinitely. Looking at the task list, I can see
we are stuck in jbd2_journal_release_jbd_inode waiting on a
wake up. I added some debug code to detect this scenario and
dump additional data if we were stuck in jbd2_journal_release_jbd_inode
for longer than 30 minutes. When it hit, I was able to see that
i_flags was 0, suggesting we missed the wake up.
This patch changes i_flags to be an unsigned long, uses bit operators
to access it, and adds barriers around the accesses. Prior to applying
this patch, we were regularly hitting this hang on numerous systems
in our test environment. After applying the patch, the hangs no longer
occur. Its still not clear to me why the j_list_lock doesn't protect us
in this path. It also appears a hang very similar to this was seen
in the past and then was no longer recreatable:
http://forum.soft32.com/linux/20090310-ext4-hangs-ftopict478916.html
Signed-off-by: Brian King <[email protected]>
---
fs/jbd2/commit.c | 12 ++++++++----
fs/jbd2/journal.c | 5 ++++-
include/linux/jbd2.h | 2 +-
3 files changed, 13 insertions(+), 6 deletions(-)
diff -puN include/linux/jbd2.h~jbd2_ji_commit_barrier_patch include/linux/jbd2.h
--- linux-2.6/include/linux/jbd2.h~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
+++ linux-2.6-bjking1/include/linux/jbd2.h 2010-07-14 13:46:17.000000000 -0500
@@ -395,7 +395,7 @@ struct jbd2_inode {
struct inode *i_vfs_inode;
/* Flags of inode [j_list_lock] */
- unsigned int i_flags;
+ unsigned long i_flags;
};
struct jbd2_revoke_table_s;
diff -puN fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch fs/jbd2/commit.c
--- linux-2.6/fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
+++ linux-2.6-bjking1/fs/jbd2/commit.c 2010-07-14 13:56:27.000000000 -0500
@@ -26,7 +26,9 @@
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/bitops.h>
#include <trace/events/jbd2.h>
+#include <asm/system.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -245,7 +247,7 @@ static int journal_submit_data_buffers(j
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
mapping = jinode->i_vfs_inode->i_mapping;
- jinode->i_flags |= JI_COMMIT_RUNNING;
+ set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
spin_unlock(&journal->j_list_lock);
/*
* submit the inode data buffers. We use writepage
@@ -260,7 +262,8 @@ static int journal_submit_data_buffers(j
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
commit_transaction->t_flushed_data_blocks = 1;
- jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+ smp_mb__after_clear_bit();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
spin_unlock(&journal->j_list_lock);
@@ -281,7 +284,7 @@ static int journal_finish_inode_data_buf
/* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
- jinode->i_flags |= JI_COMMIT_RUNNING;
+ set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
spin_unlock(&journal->j_list_lock);
err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
if (err) {
@@ -297,7 +300,8 @@ static int journal_finish_inode_data_buf
ret = err;
}
spin_lock(&journal->j_list_lock);
- jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+ smp_mb__after_clear_bit();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
diff -puN fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch fs/jbd2/journal.c
--- linux-2.6/fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
+++ linux-2.6-bjking1/fs/jbd2/journal.c 2010-07-14 13:46:17.000000000 -0500
@@ -41,12 +41,14 @@
#include <linux/hash.h>
#include <linux/log2.h>
#include <linux/vmalloc.h>
+#include <linux/bitops.h>
#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>
#include <asm/uaccess.h>
#include <asm/page.h>
+#include <asm/system.h>
EXPORT_SYMBOL(jbd2_journal_start);
EXPORT_SYMBOL(jbd2_journal_restart);
@@ -2209,9 +2211,10 @@ void jbd2_journal_release_jbd_inode(jour
restart:
spin_lock(&journal->j_list_lock);
/* Is commit writing out inode - we have to wait */
- if (jinode->i_flags & JI_COMMIT_RUNNING) {
+ if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+ smp_mb();
wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
spin_unlock(&journal->j_list_lock);
_
On Wed, Jul 14, 2010 at 01:58:46PM -0500, Brian King wrote:
>
> I've been debugging a hang in jbd2_journal_release_jbd_inode
> which is being seen on Power 6 systems quite a lot. When we get
> in the hung state, all I/O to the disk in question gets blocked
> where we stay indefinitely. Looking at the task list, I can see
> we are stuck in jbd2_journal_release_jbd_inode waiting on a
> wake up. I added some debug code to detect this scenario and
> dump additional data if we were stuck in jbd2_journal_release_jbd_inode
> for longer than 30 minutes. When it hit, I was able to see that
> i_flags was 0, suggesting we missed the wake up.
>
> This patch changes i_flags to be an unsigned long, uses bit operators
> to access it, and adds barriers around the accesses. Prior to applying
> this patch, we were regularly hitting this hang on numerous systems
> in our test environment. After applying the patch, the hangs no longer
> occur. Its still not clear to me why the j_list_lock doesn't protect us
> in this path. It also appears a hang very similar to this was seen
> in the past and then was no longer recreatable:
>
> http://forum.soft32.com/linux/20090310-ext4-hangs-ftopict478916.html
>
> Signed-off-by: Brian King <[email protected]>
> ---
>
> fs/jbd2/commit.c | 12 ++++++++----
> fs/jbd2/journal.c | 5 ++++-
> include/linux/jbd2.h | 2 +-
> 3 files changed, 13 insertions(+), 6 deletions(-)
>
> diff -puN include/linux/jbd2.h~jbd2_ji_commit_barrier_patch include/linux/jbd2.h
> --- linux-2.6/include/linux/jbd2.h~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
> +++ linux-2.6-bjking1/include/linux/jbd2.h 2010-07-14 13:46:17.000000000 -0500
> @@ -395,7 +395,7 @@ struct jbd2_inode {
> struct inode *i_vfs_inode;
>
> /* Flags of inode [j_list_lock] */
> - unsigned int i_flags;
> + unsigned long i_flags;
> };
>
> struct jbd2_revoke_table_s;
> diff -puN fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch fs/jbd2/commit.c
> --- linux-2.6/fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
> +++ linux-2.6-bjking1/fs/jbd2/commit.c 2010-07-14 13:56:27.000000000 -0500
> @@ -26,7 +26,9 @@
> #include <linux/backing-dev.h>
> #include <linux/bio.h>
> #include <linux/blkdev.h>
> +#include <linux/bitops.h>
> #include <trace/events/jbd2.h>
> +#include <asm/system.h>
>
> /*
> * Default IO end handler for temporary BJ_IO buffer_heads.
> @@ -245,7 +247,7 @@ static int journal_submit_data_buffers(j
> spin_lock(&journal->j_list_lock);
> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
> mapping = jinode->i_vfs_inode->i_mapping;
> - jinode->i_flags |= JI_COMMIT_RUNNING;
> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
> spin_unlock(&journal->j_list_lock);
> /*
> * submit the inode data buffers. We use writepage
> @@ -260,7 +262,8 @@ static int journal_submit_data_buffers(j
> spin_lock(&journal->j_list_lock);
> J_ASSERT(jinode->i_transaction == commit_transaction);
> commit_transaction->t_flushed_data_blocks = 1;
> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
> + smp_mb__after_clear_bit();
> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
> }
> spin_unlock(&journal->j_list_lock);
> @@ -281,7 +284,7 @@ static int journal_finish_inode_data_buf
> /* For locking, see the comment in journal_submit_data_buffers() */
> spin_lock(&journal->j_list_lock);
> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
> - jinode->i_flags |= JI_COMMIT_RUNNING;
> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
> spin_unlock(&journal->j_list_lock);
> err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
> if (err) {
> @@ -297,7 +300,8 @@ static int journal_finish_inode_data_buf
> ret = err;
> }
> spin_lock(&journal->j_list_lock);
> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
> + smp_mb__after_clear_bit();
> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
> }
>
> diff -puN fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch fs/jbd2/journal.c
> --- linux-2.6/fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
> +++ linux-2.6-bjking1/fs/jbd2/journal.c 2010-07-14 13:46:17.000000000 -0500
> @@ -41,12 +41,14 @@
> #include <linux/hash.h>
> #include <linux/log2.h>
> #include <linux/vmalloc.h>
> +#include <linux/bitops.h>
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/jbd2.h>
>
> #include <asm/uaccess.h>
> #include <asm/page.h>
> +#include <asm/system.h>
>
> EXPORT_SYMBOL(jbd2_journal_start);
> EXPORT_SYMBOL(jbd2_journal_restart);
> @@ -2209,9 +2211,10 @@ void jbd2_journal_release_jbd_inode(jour
> restart:
> spin_lock(&journal->j_list_lock);
> /* Is commit writing out inode - we have to wait */
> - if (jinode->i_flags & JI_COMMIT_RUNNING) {
> + if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
> wait_queue_head_t *wq;
> DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
> + smp_mb();
> wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
> prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
> spin_unlock(&journal->j_list_lock);
> _
Seems reasonable to me, I assume you re-tested with this patch to make sure it
still fixes the problem? You can add
Reviewed-by: Josef Bacik <[email protected]>
Thanks,
Josef
On 07/14/2010 02:05 PM, Josef Bacik wrote:
> On Wed, Jul 14, 2010 at 01:58:46PM -0500, Brian King wrote:
>>
>> I've been debugging a hang in jbd2_journal_release_jbd_inode
>> which is being seen on Power 6 systems quite a lot. When we get
>> in the hung state, all I/O to the disk in question gets blocked
>> where we stay indefinitely. Looking at the task list, I can see
>> we are stuck in jbd2_journal_release_jbd_inode waiting on a
>> wake up. I added some debug code to detect this scenario and
>> dump additional data if we were stuck in jbd2_journal_release_jbd_inode
>> for longer than 30 minutes. When it hit, I was able to see that
>> i_flags was 0, suggesting we missed the wake up.
>>
>> This patch changes i_flags to be an unsigned long, uses bit operators
>> to access it, and adds barriers around the accesses. Prior to applying
>> this patch, we were regularly hitting this hang on numerous systems
>> in our test environment. After applying the patch, the hangs no longer
>> occur. Its still not clear to me why the j_list_lock doesn't protect us
>> in this path. It also appears a hang very similar to this was seen
>> in the past and then was no longer recreatable:
>>
>> http://forum.soft32.com/linux/20090310-ext4-hangs-ftopict478916.html
>>
>> Signed-off-by: Brian King <[email protected]>
>> ---
>>
>> fs/jbd2/commit.c | 12 ++++++++----
>> fs/jbd2/journal.c | 5 ++++-
>> include/linux/jbd2.h | 2 +-
>> 3 files changed, 13 insertions(+), 6 deletions(-)
>>
>> diff -puN include/linux/jbd2.h~jbd2_ji_commit_barrier_patch include/linux/jbd2.h
>> --- linux-2.6/include/linux/jbd2.h~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
>> +++ linux-2.6-bjking1/include/linux/jbd2.h 2010-07-14 13:46:17.000000000 -0500
>> @@ -395,7 +395,7 @@ struct jbd2_inode {
>> struct inode *i_vfs_inode;
>>
>> /* Flags of inode [j_list_lock] */
>> - unsigned int i_flags;
>> + unsigned long i_flags;
>> };
>>
>> struct jbd2_revoke_table_s;
>> diff -puN fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch fs/jbd2/commit.c
>> --- linux-2.6/fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
>> +++ linux-2.6-bjking1/fs/jbd2/commit.c 2010-07-14 13:56:27.000000000 -0500
>> @@ -26,7 +26,9 @@
>> #include <linux/backing-dev.h>
>> #include <linux/bio.h>
>> #include <linux/blkdev.h>
>> +#include <linux/bitops.h>
>> #include <trace/events/jbd2.h>
>> +#include <asm/system.h>
>>
>> /*
>> * Default IO end handler for temporary BJ_IO buffer_heads.
>> @@ -245,7 +247,7 @@ static int journal_submit_data_buffers(j
>> spin_lock(&journal->j_list_lock);
>> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
>> mapping = jinode->i_vfs_inode->i_mapping;
>> - jinode->i_flags |= JI_COMMIT_RUNNING;
>> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> spin_unlock(&journal->j_list_lock);
>> /*
>> * submit the inode data buffers. We use writepage
>> @@ -260,7 +262,8 @@ static int journal_submit_data_buffers(j
>> spin_lock(&journal->j_list_lock);
>> J_ASSERT(jinode->i_transaction == commit_transaction);
>> commit_transaction->t_flushed_data_blocks = 1;
>> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
>> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> + smp_mb__after_clear_bit();
>> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> }
>> spin_unlock(&journal->j_list_lock);
>> @@ -281,7 +284,7 @@ static int journal_finish_inode_data_buf
>> /* For locking, see the comment in journal_submit_data_buffers() */
>> spin_lock(&journal->j_list_lock);
>> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
>> - jinode->i_flags |= JI_COMMIT_RUNNING;
>> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> spin_unlock(&journal->j_list_lock);
>> err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
>> if (err) {
>> @@ -297,7 +300,8 @@ static int journal_finish_inode_data_buf
>> ret = err;
>> }
>> spin_lock(&journal->j_list_lock);
>> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
>> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> + smp_mb__after_clear_bit();
>> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> }
>>
>> diff -puN fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch fs/jbd2/journal.c
>> --- linux-2.6/fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
>> +++ linux-2.6-bjking1/fs/jbd2/journal.c 2010-07-14 13:46:17.000000000 -0500
>> @@ -41,12 +41,14 @@
>> #include <linux/hash.h>
>> #include <linux/log2.h>
>> #include <linux/vmalloc.h>
>> +#include <linux/bitops.h>
>>
>> #define CREATE_TRACE_POINTS
>> #include <trace/events/jbd2.h>
>>
>> #include <asm/uaccess.h>
>> #include <asm/page.h>
>> +#include <asm/system.h>
>>
>> EXPORT_SYMBOL(jbd2_journal_start);
>> EXPORT_SYMBOL(jbd2_journal_restart);
>> @@ -2209,9 +2211,10 @@ void jbd2_journal_release_jbd_inode(jour
>> restart:
>> spin_lock(&journal->j_list_lock);
>> /* Is commit writing out inode - we have to wait */
>> - if (jinode->i_flags & JI_COMMIT_RUNNING) {
>> + if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
>> wait_queue_head_t *wq;
>> DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
>> + smp_mb();
>> wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
>> spin_unlock(&journal->j_list_lock);
>> _
>
> Seems reasonable to me, I assume you re-tested with this patch to make sure it
> still fixes the problem? You can add
I'm building a kernel with the updated patch and we will load it up on
the failing systems and retest. It usually takes a number of hours
before the problem hits on our test systems, so I may have some early
results tomorrow.
Thanks,
Brian
--
Brian King
Linux on Power Virtualization
IBM Linux Technology Center
On 07/14/2010 02:05 PM, Josef Bacik wrote:
> On Wed, Jul 14, 2010 at 01:58:46PM -0500, Brian King wrote:
>>
>> I've been debugging a hang in jbd2_journal_release_jbd_inode
>> which is being seen on Power 6 systems quite a lot. When we get
>> in the hung state, all I/O to the disk in question gets blocked
>> where we stay indefinitely. Looking at the task list, I can see
>> we are stuck in jbd2_journal_release_jbd_inode waiting on a
>> wake up. I added some debug code to detect this scenario and
>> dump additional data if we were stuck in jbd2_journal_release_jbd_inode
>> for longer than 30 minutes. When it hit, I was able to see that
>> i_flags was 0, suggesting we missed the wake up.
>>
>> This patch changes i_flags to be an unsigned long, uses bit operators
>> to access it, and adds barriers around the accesses. Prior to applying
>> this patch, we were regularly hitting this hang on numerous systems
>> in our test environment. After applying the patch, the hangs no longer
>> occur. Its still not clear to me why the j_list_lock doesn't protect us
>> in this path. It also appears a hang very similar to this was seen
>> in the past and then was no longer recreatable:
>>
>> http://forum.soft32.com/linux/20090310-ext4-hangs-ftopict478916.html
>>
>> Signed-off-by: Brian King <[email protected]>
>> ---
>>
>> fs/jbd2/commit.c | 12 ++++++++----
>> fs/jbd2/journal.c | 5 ++++-
>> include/linux/jbd2.h | 2 +-
>> 3 files changed, 13 insertions(+), 6 deletions(-)
>>
>> diff -puN include/linux/jbd2.h~jbd2_ji_commit_barrier_patch include/linux/jbd2.h
>> --- linux-2.6/include/linux/jbd2.h~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
>> +++ linux-2.6-bjking1/include/linux/jbd2.h 2010-07-14 13:46:17.000000000 -0500
>> @@ -395,7 +395,7 @@ struct jbd2_inode {
>> struct inode *i_vfs_inode;
>>
>> /* Flags of inode [j_list_lock] */
>> - unsigned int i_flags;
>> + unsigned long i_flags;
>> };
>>
>> struct jbd2_revoke_table_s;
>> diff -puN fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch fs/jbd2/commit.c
>> --- linux-2.6/fs/jbd2/commit.c~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
>> +++ linux-2.6-bjking1/fs/jbd2/commit.c 2010-07-14 13:56:27.000000000 -0500
>> @@ -26,7 +26,9 @@
>> #include <linux/backing-dev.h>
>> #include <linux/bio.h>
>> #include <linux/blkdev.h>
>> +#include <linux/bitops.h>
>> #include <trace/events/jbd2.h>
>> +#include <asm/system.h>
>>
>> /*
>> * Default IO end handler for temporary BJ_IO buffer_heads.
>> @@ -245,7 +247,7 @@ static int journal_submit_data_buffers(j
>> spin_lock(&journal->j_list_lock);
>> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
>> mapping = jinode->i_vfs_inode->i_mapping;
>> - jinode->i_flags |= JI_COMMIT_RUNNING;
>> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> spin_unlock(&journal->j_list_lock);
>> /*
>> * submit the inode data buffers. We use writepage
>> @@ -260,7 +262,8 @@ static int journal_submit_data_buffers(j
>> spin_lock(&journal->j_list_lock);
>> J_ASSERT(jinode->i_transaction == commit_transaction);
>> commit_transaction->t_flushed_data_blocks = 1;
>> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
>> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> + smp_mb__after_clear_bit();
>> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> }
>> spin_unlock(&journal->j_list_lock);
>> @@ -281,7 +284,7 @@ static int journal_finish_inode_data_buf
>> /* For locking, see the comment in journal_submit_data_buffers() */
>> spin_lock(&journal->j_list_lock);
>> list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
>> - jinode->i_flags |= JI_COMMIT_RUNNING;
>> + set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> spin_unlock(&journal->j_list_lock);
>> err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
>> if (err) {
>> @@ -297,7 +300,8 @@ static int journal_finish_inode_data_buf
>> ret = err;
>> }
>> spin_lock(&journal->j_list_lock);
>> - jinode->i_flags &= ~JI_COMMIT_RUNNING;
>> + clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
>> + smp_mb__after_clear_bit();
>> wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> }
>>
>> diff -puN fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch fs/jbd2/journal.c
>> --- linux-2.6/fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
>> +++ linux-2.6-bjking1/fs/jbd2/journal.c 2010-07-14 13:46:17.000000000 -0500
>> @@ -41,12 +41,14 @@
>> #include <linux/hash.h>
>> #include <linux/log2.h>
>> #include <linux/vmalloc.h>
>> +#include <linux/bitops.h>
>>
>> #define CREATE_TRACE_POINTS
>> #include <trace/events/jbd2.h>
>>
>> #include <asm/uaccess.h>
>> #include <asm/page.h>
>> +#include <asm/system.h>
>>
>> EXPORT_SYMBOL(jbd2_journal_start);
>> EXPORT_SYMBOL(jbd2_journal_restart);
>> @@ -2209,9 +2211,10 @@ void jbd2_journal_release_jbd_inode(jour
>> restart:
>> spin_lock(&journal->j_list_lock);
>> /* Is commit writing out inode - we have to wait */
>> - if (jinode->i_flags & JI_COMMIT_RUNNING) {
>> + if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
>> wait_queue_head_t *wq;
>> DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
>> + smp_mb();
>> wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
>> spin_unlock(&journal->j_list_lock);
>> _
>
> Seems reasonable to me, I assume you re-tested with this patch to make sure it
> still fixes the problem? You can add
>
> Reviewed-by: Josef Bacik <[email protected]>
We've now run this updated patch for 72 hours in our stress environment
that was typically hitting the issue within 12 hours.
Thanks,
Brian
--
Brian King
Linux on Power Virtualization
IBM Linux Technology Center
>
> I've been debugging a hang in jbd2_journal_release_jbd_inode
> which is being seen on Power 6 systems quite a lot. When we get
> in the hung state, all I/O to the disk in question gets blocked
> where we stay indefinitely. Looking at the task list, I can see
> we are stuck in jbd2_journal_release_jbd_inode waiting on a
> wake up. I added some debug code to detect this scenario and
> dump additional data if we were stuck in jbd2_journal_release_jbd_inode
> for longer than 30 minutes. When it hit, I was able to see that
> i_flags was 0, suggesting we missed the wake up.
>
> This patch changes i_flags to be an unsigned long, uses bit operators
> to access it, and adds barriers around the accesses. Prior to applying
> this patch, we were regularly hitting this hang on numerous systems
> in our test environment. After applying the patch, the hangs no longer
> occur. Its still not clear to me why the j_list_lock doesn't protect us
> in this path.
Thanks for debugging this! I was thinking hard about how it could happen that
wake_up_bit doesn't wake up the waiter but I haven't found any explanation. All
the waitqueue work seems to be properly wrapped inside the j_list_lock so
even the waitqueue_active check in wake_up_bit should be fine.
I'd really like to understand what in my mind-model of spinlocks etc. is
wrong. So could you maybe run a test with the attached debug patch and
dump 'wait.seen' value in the hung task?
And one more question - if you remove 'waitqueue_active' check from
kernel/wait.c:__wake_up_bit
is the problem still present? Thanks a lot in advance.
Honza
--
Jan Kara <[email protected]>
SuSE CR Labs
> >
> > I've been debugging a hang in jbd2_journal_release_jbd_inode
> > which is being seen on Power 6 systems quite a lot. When we get
> > in the hung state, all I/O to the disk in question gets blocked
> > where we stay indefinitely. Looking at the task list, I can see
> > we are stuck in jbd2_journal_release_jbd_inode waiting on a
> > wake up. I added some debug code to detect this scenario and
> > dump additional data if we were stuck in jbd2_journal_release_jbd_inode
> > for longer than 30 minutes. When it hit, I was able to see that
> > i_flags was 0, suggesting we missed the wake up.
> >
> > This patch changes i_flags to be an unsigned long, uses bit operators
> > to access it, and adds barriers around the accesses. Prior to applying
> > this patch, we were regularly hitting this hang on numerous systems
> > in our test environment. After applying the patch, the hangs no longer
> > occur. Its still not clear to me why the j_list_lock doesn't protect us
> > in this path.
> Thanks for debugging this! I was thinking hard about how it could happen that
> wake_up_bit doesn't wake up the waiter but I haven't found any explanation. All
> the waitqueue work seems to be properly wrapped inside the j_list_lock so
> even the waitqueue_active check in wake_up_bit should be fine.
> I'd really like to understand what in my mind-model of spinlocks etc. is
> wrong. So could you maybe run a test with the attached debug patch and
> dump 'wait.seen' value in the hung task?
> And one more question - if you remove 'waitqueue_active' check from
> kernel/wait.c:__wake_up_bit
> is the problem still present? Thanks a lot in advance.
Oops, the patch was missing 'seen' declaration. Here's an updated version.
--
Jan Kara <[email protected]>
SuSE CR Labs
On 07/21/2010 02:02 PM, Jan Kara wrote:
>>
>> I've been debugging a hang in jbd2_journal_release_jbd_inode
>> which is being seen on Power 6 systems quite a lot. When we get
>> in the hung state, all I/O to the disk in question gets blocked
>> where we stay indefinitely. Looking at the task list, I can see
>> we are stuck in jbd2_journal_release_jbd_inode waiting on a
>> wake up. I added some debug code to detect this scenario and
>> dump additional data if we were stuck in jbd2_journal_release_jbd_inode
>> for longer than 30 minutes. When it hit, I was able to see that
>> i_flags was 0, suggesting we missed the wake up.
>>
>> This patch changes i_flags to be an unsigned long, uses bit operators
>> to access it, and adds barriers around the accesses. Prior to applying
>> this patch, we were regularly hitting this hang on numerous systems
>> in our test environment. After applying the patch, the hangs no longer
>> occur. Its still not clear to me why the j_list_lock doesn't protect us
>> in this path.
> Thanks for debugging this! I was thinking hard about how it could happen that
> wake_up_bit doesn't wake up the waiter but I haven't found any explanation. All
> the waitqueue work seems to be properly wrapped inside the j_list_lock so
> even the waitqueue_active check in wake_up_bit should be fine.
> I'd really like to understand what in my mind-model of spinlocks etc. is
> wrong. So could you maybe run a test with the attached debug patch and
> dump 'wait.seen' value in the hung task?
> And one more question - if you remove 'waitqueue_active' check from
> kernel/wait.c:__wake_up_bit
> is the problem still present? Thanks a lot in advance.
I'll see about getting one of our systems loaded up with this change and see
what happens.
Thanks!
Brian
--
Brian King
Linux on Power Virtualization
IBM Linux Technology Center
On Wed, Jul 14, 2010 at 01:58:46PM -0500, Brian King wrote:
>
> I've been debugging a hang in jbd2_journal_release_jbd_inode
> which is being seen on Power 6 systems quite a lot. When we get
> in the hung state, all I/O to the disk in question gets blocked
> where we stay indefinitely. Looking at the task list, I can see
> we are stuck in jbd2_journal_release_jbd_inode waiting on a
> wake up. I added some debug code to detect this scenario and
> dump additional data if we were stuck in jbd2_journal_release_jbd_inode
> for longer than 30 minutes. When it hit, I was able to see that
> i_flags was 0, suggesting we missed the wake up.
>
> This patch changes i_flags to be an unsigned long, uses bit operators
> to access it, and adds barriers around the accesses. Prior to applying
> this patch, we were regularly hitting this hang on numerous systems
> in our test environment. After applying the patch, the hangs no longer
> occur. Its still not clear to me why the j_list_lock doesn't protect us
> in this path. It also appears a hang very similar to this was seen
> in the past and then was no longer recreatable:
I've been look at this patch, and I can see how converting to bitops
definitely makes sense. I can also see how adding
smp_mb__after_clear_bit() makes sense. However, it's not clear the
smp_mb() call here helps?
> diff -puN fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch fs/jbd2/journal.c
> --- linux-2.6/fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
> +++ linux-2.6-bjking1/fs/jbd2/journal.c 2010-07-14 13:46:17.000000000 -0500
> @@ -2209,9 +2211,10 @@ void jbd2_journal_release_jbd_inode(jour
> restart:
> spin_lock(&journal->j_list_lock);
> /* Is commit writing out inode - we have to wait */
> - if (jinode->i_flags & JI_COMMIT_RUNNING) {
> + if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
> wait_queue_head_t *wq;
> DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
> + smp_mb();
> wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
> prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
> spin_unlock(&journal->j_list_lock);
- Ted
On 08/27/2010 02:10 PM, Ted Ts'o wrote:
> On Wed, Jul 14, 2010 at 01:58:46PM -0500, Brian King wrote:
>>
>> I've been debugging a hang in jbd2_journal_release_jbd_inode
>> which is being seen on Power 6 systems quite a lot. When we get
>> in the hung state, all I/O to the disk in question gets blocked
>> where we stay indefinitely. Looking at the task list, I can see
>> we are stuck in jbd2_journal_release_jbd_inode waiting on a
>> wake up. I added some debug code to detect this scenario and
>> dump additional data if we were stuck in jbd2_journal_release_jbd_inode
>> for longer than 30 minutes. When it hit, I was able to see that
>> i_flags was 0, suggesting we missed the wake up.
>>
>> This patch changes i_flags to be an unsigned long, uses bit operators
>> to access it, and adds barriers around the accesses. Prior to applying
>> this patch, we were regularly hitting this hang on numerous systems
>> in our test environment. After applying the patch, the hangs no longer
>> occur. Its still not clear to me why the j_list_lock doesn't protect us
>> in this path. It also appears a hang very similar to this was seen
>> in the past and then was no longer recreatable:
>
> I've been look at this patch, and I can see how converting to bitops
> definitely makes sense. I can also see how adding
> smp_mb__after_clear_bit() makes sense. However, it's not clear the
> smp_mb() call here helps?
It may not be necessary. I originally added it in order to balance
the test_bit with the clear_bit. I'll check with the folks hitting this
in test and see if I can get access to the failing machine. If so,
I'll pull this out and see if we actually need it or not.
Thanks,
Brian
--
Brian King
Linux on Power Virtualization
IBM Linux Technology Center
On 08/27/2010 02:10 PM, Ted Ts'o wrote:
> On Wed, Jul 14, 2010 at 01:58:46PM -0500, Brian King wrote:
>>
>> I've been debugging a hang in jbd2_journal_release_jbd_inode
>> which is being seen on Power 6 systems quite a lot. When we get
>> in the hung state, all I/O to the disk in question gets blocked
>> where we stay indefinitely. Looking at the task list, I can see
>> we are stuck in jbd2_journal_release_jbd_inode waiting on a
>> wake up. I added some debug code to detect this scenario and
>> dump additional data if we were stuck in jbd2_journal_release_jbd_inode
>> for longer than 30 minutes. When it hit, I was able to see that
>> i_flags was 0, suggesting we missed the wake up.
>>
>> This patch changes i_flags to be an unsigned long, uses bit operators
>> to access it, and adds barriers around the accesses. Prior to applying
>> this patch, we were regularly hitting this hang on numerous systems
>> in our test environment. After applying the patch, the hangs no longer
>> occur. Its still not clear to me why the j_list_lock doesn't protect us
>> in this path. It also appears a hang very similar to this was seen
>> in the past and then was no longer recreatable:
>
> I've been look at this patch, and I can see how converting to bitops
> definitely makes sense. I can also see how adding
> smp_mb__after_clear_bit() makes sense. However, it's not clear the
> smp_mb() call here helps?
I pulled out the smp_mb and we ran clean for 75 hours before stopping the test.
We would generally hit this in less than 36 hours, so I think we can safely
remove this barrier. I'll send out an updated patch.
Thanks,
Brian
>
>> diff -puN fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch fs/jbd2/journal.c
>> --- linux-2.6/fs/jbd2/journal.c~jbd2_ji_commit_barrier_patch 2010-07-14 13:46:17.000000000 -0500
>> +++ linux-2.6-bjking1/fs/jbd2/journal.c 2010-07-14 13:46:17.000000000 -0500
>> @@ -2209,9 +2211,10 @@ void jbd2_journal_release_jbd_inode(jour
>> restart:
>> spin_lock(&journal->j_list_lock);
>> /* Is commit writing out inode - we have to wait */
>> - if (jinode->i_flags & JI_COMMIT_RUNNING) {
>> + if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
>> wait_queue_head_t *wq;
>> DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
>> + smp_mb();
>> wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
>> prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
>> spin_unlock(&journal->j_list_lock);
>
> - Ted
--
Brian King
Linux on Power Virtualization
IBM Linux Technology Center