Hi,
Below is my rewrite of ordered mode in JBD. Instead of a list of data
buffers that need syncing on transaction commit, we now keep a list of
inodes that need writeout during commit. This brings all sorts of
advantages, such as the possibility to get rid of journal heads and buffer
heads for data buffers in ordered mode, better ordering of writes on
transaction commit, simplification of some JBD code, and no more anonymous
pages left behind when data under commit is truncated. The patch has
survived some light testing, but it still has some potential to eat your
data, so beware :) I've run dbench to check whether the different handling
of truncate decreases performance: the throughput I'm getting on my machine
is the same (OK, lower by 0.5%) if I disable the code in truncate that
waits for commit to finish... Also, dbench throughput is about 2% better
with my patch than with current JBD.
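To illustrate, the whole interface a filesystem now has to use boils down
to the following (an illustrative sketch only, not part of the patch - the
real callers are in the diff below):

    /* when the in-memory inode is created / loaded */
    journal_init_jbd_inode(&EXT3_I(inode)->jinode, inode);

    /* whenever a handle modifies the inode's data in ordered mode
     * (write_end, writepage doing hole filling, ...) */
    journal_file_inode(handle, &EXT3_I(inode)->jinode);

    /* before truncating data which the committing transaction may
     * still have to write out */
    journal_begin_ordered_truncate(&EXT3_I(inode)->jinode, attr->ia_size);

    /* when the inode is being evicted from memory */
    journal_release_jbd_inode(EXT3_SB(inode->i_sb)->s_journal,
                              &EXT3_I(inode)->jinode);

On commit, JBD walks t_inode_list, submits the dirty pages of each listed
inode via do_writepages() and later waits for them with
filemap_fdatawait(), instead of tracking individual data buffers.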
Any comments or testing most welcome.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
---
Signed-off-by: Jan Kara <[email protected]>
diff --git a/fs/buffer.c b/fs/buffer.c
index 897cd74..bd6aefd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1675,7 +1675,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+ } else if (!buffer_mapped(bh) && buffer_dirty(bh)
+ && !wbc->skip_unmapped) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, 1);
if (err)
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 4f4020c..8530e5d 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -588,6 +588,7 @@ got:
ei->i_extra_isize =
(EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
+ journal_init_jbd_inode(&ei->jinode, inode);
ret = inode;
if(DQUOT_ALLOC_INODE(inode)) {
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index eb95670..b3d933b 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -40,6 +40,7 @@
#include "acl.h"
static int ext3_writepage_trans_blocks(struct inode *inode);
+static int ext3_begin_ordered_truncate(struct inode *inode, loff_t new_size);
/*
* Test whether an inode is a fast symlink.
@@ -183,6 +184,8 @@ void ext3_delete_inode (struct inode * inode)
{
handle_t *handle;
+ if (ext3_should_order_data(inode))
+ ext3_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -1185,14 +1188,9 @@ out:
return ret;
}
-
-int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
+static int ext3_file_inode(handle_t *handle, struct inode *inode)
{
- int err = journal_dirty_data(handle, bh);
- if (err)
- ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
- bh, handle, err);
- return err;
+ return journal_file_inode(handle, &EXT3_I(inode)->jinode);
}
/* For write_end() in data=journal mode */
@@ -1247,8 +1245,8 @@ static int ext3_ordered_write_end(struct file *file,
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext3_journal_dirty_data);
+ if (ext3_should_order_data(inode))
+ ret = ext3_file_inode(handle, inode);
if (ret == 0) {
/*
@@ -1398,25 +1396,6 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping,block,ext3_get_block);
}
-static int bget_one(handle_t *handle, struct buffer_head *bh)
-{
- get_bh(bh);
- return 0;
-}
-
-static int bput_one(handle_t *handle, struct buffer_head *bh)
-{
- put_bh(bh);
- return 0;
-}
-
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
- if (buffer_mapped(bh))
- return ext3_journal_dirty_data(handle, bh);
- return 0;
-}
-
/*
* Note that we always start a transaction even if we're not journalling
* data. This is to preserve ordering: any hole instantiation within
@@ -1465,15 +1444,11 @@ static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
* We don't honour synchronous mounts for writepage(). That would be
* disastrous. Any write() or metadata operation will sync the fs for
* us.
- *
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
static int ext3_ordered_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- struct buffer_head *page_bufs;
handle_t *handle = NULL;
int ret = 0;
int err;
@@ -1487,46 +1462,49 @@ static int ext3_ordered_writepage(struct page *page,
if (ext3_journal_current_handle())
goto out_fail;
- handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
-
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
+ /*
+ * Now there are two different reasons why we can be called:
+ * 1) write out during commit
+ * 2) fsync / writeout to free memory
+ *
+ * In the first case, we just need to write the buffer to disk, in the
+ * second case we may need to do hole filling and attach the inode to
+ * the transaction. Note that even in the first case, we may get an
+ * unmapped buffer (hole fill with data via mmap) but we don't have to
+ * write it - actually, we can't because from a transaction commit we
+ * cannot start a new transaction or we could deadlock.
+ */
+ if (!wbc->skip_unmapped) {
+ handle = ext3_journal_start(inode,
+ ext3_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_fail;
+ }
}
+ else if (!PageMappedToDisk(page))
+ goto out_fail;
+ /* This can go as soon as someone cares about that ;) */
if (!page_has_buffers(page)) {
create_empty_buffers(page, inode->i_sb->s_blocksize,
(1 << BH_Dirty)|(1 << BH_Uptodate));
}
- page_bufs = page_buffers(page);
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bget_one);
ret = block_write_full_page(page, ext3_get_block, wbc);
/*
- * The page can become unlocked at any point now, and
- * truncate can then come in and change things. So we
- * can't touch *page from now on. But *page_bufs is
- * safe due to elevated refcount.
- */
-
- /*
- * And attach them to the current transaction. But only if
- * block_write_full_page() succeeded. Otherwise they are unmapped,
- * and generally junk.
+ * The page can become unlocked at any point now, and truncate can then
+ * come in and change things.
+ * FIXME: Can we get up to delete? If so we should prevent that...
*/
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
- NULL, journal_dirty_data_fn);
+ if (ret == 0 && handle)
+ ret = ext3_file_inode(handle, inode);
+ if (handle) {
+ err = ext3_journal_stop(handle);
if (!ret)
ret = err;
}
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bput_one);
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
return ret;
out_fail:
@@ -1902,7 +1880,7 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
err = ext3_journal_dirty_metadata(handle, bh);
} else {
if (ext3_should_order_data(inode))
- err = ext3_journal_dirty_data(handle, bh);
+ err = ext3_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
@@ -2676,6 +2654,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
ei->i_block_alloc_info = NULL;
+ journal_init_jbd_inode(&ei->jinode, inode);
ret = __ext3_get_inode_loc(inode, &iloc, 0);
if (ret < 0)
@@ -2974,6 +2953,11 @@ int ext3_write_inode(struct inode *inode, int wait)
return ext3_force_commit(inode->i_sb);
}
+static int ext3_begin_ordered_truncate(struct inode *inode, loff_t new_size)
+{
+ return journal_begin_ordered_truncate(&EXT3_I(inode)->jinode, new_size);
+}
+
/*
* ext3_setattr()
*
@@ -2989,7 +2973,14 @@ int ext3_write_inode(struct inode *inode, int wait)
* be freed, so we have a strong guarantee that no future commit will
* leave these blocks visible to the user.)
*
- * Called with inode->sem down.
+ * Another thing we have to asure is that if we are in ordered mode
+ * and inode is still attached to the committing transaction, we must
+ * we start writeout of all the dirty buffers which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
*/
int ext3_setattr(struct dentry *dentry, struct iattr *attr)
{
@@ -3032,6 +3023,13 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
handle_t *handle;
+ if (ext3_should_order_data(inode)) {
+ error = ext3_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (error)
+ goto err_out;
+ }
+
handle = ext3_journal_start(inode, 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 18769cc..bd7eae6 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -520,6 +520,8 @@ static void ext3_clear_inode(struct inode *inode)
EXT3_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
kfree(rsv);
+ journal_release_jbd_inode(EXT3_SB(inode->i_sb)->s_journal,
+ &EXT3_I(inode)->jinode);
}
static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index a5432bb..9ef048a 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -682,7 +682,6 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
- J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index a38c718..0553df2 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,8 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -35,8 +37,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
}
/*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * When an ext3 file is truncated, it is possible that some pages are not
+ * sucessfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -77,21 +79,6 @@ nope:
__brelse(bh);
}
-/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
- * return 0. j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
- if (!jbd_trylock_bh_state(bh)) {
- spin_unlock(&journal->j_list_lock);
- schedule();
- return 0;
- }
- return 1;
-}
-
/* Done it all: now write the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
* mode we can now just skip the rest of the journal write
@@ -158,119 +145,88 @@ static int journal_write_commit_record(journal_t *journal,
return (ret == -EIO);
}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit all the data buffers of inode associated with the
+ * transaction to disk.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- int i;
+ struct jbd_inode *jinode;
+ int err, ret = 0;
+ struct address_space *mapping;
+ struct writeback_control wbc = {
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ .sync_mode = WB_SYNC_NONE,
+ .skip_unmapped = 1,
+ };
- for (i = 0; i < bufs; i++) {
- wbuf[i]->b_end_io = end_buffer_write_sync;
- /* We use-up our safety reference in submit_bh() */
- submit_bh(WRITE, wbuf[i]);
+ /*
+ * We are in a committing transaction. Therefore no new inode
+ * can be added to our inode list. We use JI_COMMIT_RUNNING
+ * flag to protect inode we currently operate on from being
+ * released while we write out pages.
+ */
+ spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ mapping = jinode->i_vfs_inode->i_mapping;
+ if (!mapping_cap_writeback_dirty(mapping))
+ continue;
+ wbc.nr_to_write = mapping->nrpages * 2;
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ err = do_writepages(jinode->i_vfs_inode->i_mapping, &wbc);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
+ spin_unlock(&journal->j_list_lock);
+ return ret;
}
/*
- * Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
*/
-static void journal_submit_data_buffers(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_finish_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- struct journal_head *jh;
- struct buffer_head *bh;
- int locked;
- int bufs = 0;
- struct buffer_head **wbuf = journal->j_wbuf;
+ struct jbd_inode *jinode, *next_i;
+ int err, ret = 0;
- /*
- * Whenever we unlock the journal and sleep, things can get added
- * onto ->t_sync_datalist, so we have to keep looping back to
- * write_out_data until we *know* that the list is empty.
- *
- * Cleanup any flushed data buffers from the data list. Even in
- * abort mode, we want to flush this out as soon as possible.
- */
-write_out_data:
- cond_resched();
+ /* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ }
- while (commit_transaction->t_sync_datalist) {
- jh = commit_transaction->t_sync_datalist;
- bh = jh2bh(jh);
- locked = 0;
-
- /* Get reference just to make sure buffer does not disappear
- * when we are forced to drop various locks */
- get_bh(bh);
- /* If the buffer is dirty, we need to submit IO and hence
- * we need the buffer lock. We try to lock the buffer without
- * blocking. If we fail, we need to drop j_list_lock and do
- * blocking lock_buffer().
- */
- if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
- BUFFER_TRACE(bh, "needs blocking lock");
- spin_unlock(&journal->j_list_lock);
- /* Write out all data to prevent deadlocks */
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- lock_buffer(bh);
- spin_lock(&journal->j_list_lock);
- }
- locked = 1;
- }
- /* We have to get bh_state lock. Again out of order, sigh. */
- if (!inverted_lock(journal, bh)) {
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- }
- /* Someone already cleaned up the buffer? */
- if (!buffer_jbd(bh)
- || jh->b_transaction != commit_transaction
- || jh->b_jlist != BJ_SyncData) {
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "already cleaned up");
- put_bh(bh);
- continue;
- }
- if (locked && test_clear_buffer_dirty(bh)) {
- BUFFER_TRACE(bh, "needs writeout, adding to array");
- wbuf[bufs++] = bh;
- __journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- if (bufs == journal->j_wbufsize) {
- spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- goto write_out_data;
- }
- } else if (!locked && buffer_locked(bh)) {
- __journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- put_bh(bh);
- } else {
- BUFFER_TRACE(bh, "writeout complete: unfile");
- __journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- journal_remove_journal_head(bh);
- /* Once for our safety reference, once for
- * journal_remove_journal_head() */
- put_bh(bh);
- put_bh(bh);
- }
-
- if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
+ /* Now refile inode to proper lists */
+ list_for_each_entry_safe(jinode, next_i,
+ &commit_transaction->t_inode_list, i_list) {
+ list_del(&jinode->i_list);
+ if (jinode->i_next_transaction) {
+ jinode->i_transaction = jinode->i_next_transaction;
+ jinode->i_next_transaction = NULL;
+ list_add(&jinode->i_list,
+ &jinode->i_transaction->t_inode_list);
}
+ else
+ jinode->i_transaction = NULL;
}
spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
+
+ return ret;
}
/*
@@ -426,44 +382,7 @@ void journal_commit_transaction(journal_t *journal)
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
- err = 0;
- journal_submit_data_buffers(journal, commit_transaction);
-
- /*
- * Wait for all previously submitted IO to complete.
- */
- spin_lock(&journal->j_list_lock);
- while (commit_transaction->t_locked_list) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_locked_list->b_tprev;
- bh = jh2bh(jh);
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- err = -EIO;
- spin_lock(&journal->j_list_lock);
- }
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
- spin_lock(&journal->j_list_lock);
- continue;
- }
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
- __journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh);
- put_bh(bh);
- } else {
- jbd_unlock_bh_state(bh);
- }
- put_bh(bh);
- cond_resched_lock(&journal->j_list_lock);
- }
- spin_unlock(&journal->j_list_lock);
-
+ err = journal_submit_data_buffers(journal, commit_transaction);
if (err)
journal_abort(journal, err);
@@ -472,22 +391,13 @@ void journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD: commit phase 2\n");
/*
- * If we found any dirty or locked buffers, then we should have
- * looped back up to the write_out_data label. If there weren't
- * any then journal_clean_data_list should have wiped the list
- * clean by now, so check that it is in fact empty.
- */
- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
- jbd_debug (3, "JBD: commit phase 3\n");
-
- /*
* Way to go: we have now written out all of the data for a
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
*/
commit_transaction->t_state = T_COMMIT;
+ err = 0;
descriptor = NULL;
bufs = 0;
while (commit_transaction->t_buffers) {
@@ -655,7 +565,14 @@ start_journal_io:
so we incur less scheduling load.
*/
- jbd_debug(3, "JBD: commit phase 4\n");
+ jbd_debug(3, "JBD: commit phase 3\n");
+
+ /*
+ * First wait for data buffers.
+ */
+ err = journal_finish_data_buffers(journal, commit_transaction);
+ if (err)
+ journal_abort(journal, err);
/*
* akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -714,7 +631,7 @@ wait_for_iobuf:
J_ASSERT (commit_transaction->t_shadow_list == NULL);
- jbd_debug(3, "JBD: commit phase 5\n");
+ jbd_debug(3, "JBD: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
@@ -741,7 +658,7 @@ wait_for_iobuf:
/* AKPM: bforget here */
}
- jbd_debug(3, "JBD: commit phase 6\n");
+ jbd_debug(3, "JBD: commit phase 5\n");
if (journal_write_commit_record(journal, commit_transaction))
err = -EIO;
@@ -754,9 +671,9 @@ wait_for_iobuf:
transaction can be removed from any checkpoint list it was on
before. */
- jbd_debug(3, "JBD: commit phase 7\n");
+ jbd_debug(3, "JBD: commit phase 6\n");
- J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -876,7 +793,7 @@ restart_loop:
/* Done with this transaction! */
- jbd_debug(3, "JBD: commit phase 8\n");
+ jbd_debug(3, "JBD: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 3943a89..6723f9e 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1852,6 +1852,51 @@ void journal_put_journal_head(struct journal_head *jh)
}
/*
+ * Initialize jbd inode head
+ */
+void journal_init_jbd_inode(struct jbd_inode *jinode, struct inode *inode)
+{
+ jinode->i_transaction = NULL;
+ jinode->i_next_transaction = NULL;
+ jinode->i_vfs_inode = inode;
+ jinode->i_flags = 0;
+ INIT_LIST_HEAD(&jinode->i_list);
+}
+
+/*
+ * Function to be called before we start removing inode from memory (i.e.,
+ * clear_inode() is a fine place to be called from). It removes inode from
+ * transaction's lists.
+ */
+void journal_release_jbd_inode(journal_t *journal, struct jbd_inode *jinode)
+{
+ int writeout = 0;
+
+restart:
+ spin_lock(&journal->j_list_lock);
+ /* Is commit writing out inode - we have to wait */
+ if (jinode->i_flags & JI_COMMIT_RUNNING) {
+ wait_queue_head_t *wq;
+ DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+ wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&journal->j_list_lock);
+ schedule();
+ finish_wait(wq, &wait.wait);
+ goto restart;
+ }
+
+ /* Do we need to wait for data writeback? */
+ if (journal->j_committing_transaction == jinode->i_transaction)
+ writeout = 1;
+ if (jinode->i_transaction) {
+ list_del(&jinode->i_list);
+ jinode->i_transaction = NULL;
+ }
+ spin_unlock(&journal->j_list_lock);
+}
+
+/*
* debugfs tunables
*/
#ifdef CONFIG_JBD_DEBUG
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c6cbb6c..930d5af 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/writeback.h> /* For WB_SYNC_ALL :( */
static void __journal_temp_unlink_buffer(struct journal_head *jh);
@@ -52,6 +53,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
+ INIT_LIST_HEAD(&transaction->t_inode_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -920,185 +922,6 @@ out:
}
/**
- * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * Description:
- * Mark a buffer as containing dirty data which needs to be flushed before
- * we can commit the current transaction.
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- journal_t *journal = handle->h_transaction->t_journal;
- int need_brelse = 0;
- struct journal_head *jh;
-
- if (is_handle_aborted(handle))
- return 0;
-
- jh = journal_add_journal_head(bh);
- JBUFFER_TRACE(jh, "entry");
-
- /*
- * The buffer could *already* be dirty. Writeout can start
- * at any time.
- */
- jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-
- /*
- * What if the buffer is already part of a running transaction?
- *
- * There are two cases:
- * 1) It is part of the current running transaction. Refile it,
- * just in case we have allocated it as metadata, deallocated
- * it, then reallocated it as data.
- * 2) It is part of the previous, still-committing transaction.
- * If all we want to do is to guarantee that the buffer will be
- * written to disk before this new transaction commits, then
- * being sure that the *previous* transaction has this same
- * property is sufficient for us! Just leave it on its old
- * transaction.
- *
- * In case (2), the buffer must not already exist as metadata
- * --- that would violate write ordering (a transaction is free
- * to write its data at any point, even before the previous
- * committing transaction has committed). The caller must
- * never, ever allow this to happen: there's nothing we can do
- * about it in this layer.
- */
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
-
- /* Now that we have bh_state locked, are we really still mapped? */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
- goto no_journal;
- }
-
- if (jh->b_transaction) {
- JBUFFER_TRACE(jh, "has transaction");
- if (jh->b_transaction != handle->h_transaction) {
- JBUFFER_TRACE(jh, "belongs to older transaction");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
-
- /* @@@ IS THIS TRUE ? */
- /*
- * Not any more. Scenario: someone does a write()
- * in data=journal mode. The buffer's transaction has
- * moved into commit. Then someone does another
- * write() to the file. We do the frozen data copyout
- * and set b_next_transaction to point to j_running_t.
- * And while we're in that state, someone does a
- * writepage() in an attempt to pageout the same area
- * of the file via a shared mapping. At present that
- * calls journal_dirty_data(), and we get right here.
- * It may be too late to journal the data. Simply
- * falling through to the next test will suffice: the
- * data will be dirty and wil be checkpointed. The
- * ordering comments in the next comment block still
- * apply.
- */
- //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-
- /*
- * If we're journalling data, and this buffer was
- * subject to a write(), it could be metadata, forget
- * or shadow against the committing transaction. Now,
- * someone has dirtied the same darn page via a mapping
- * and it is being writepage()'d.
- * We *could* just steal the page from commit, with some
- * fancy locking there. Instead, we just skip it -
- * don't tie the page's buffers to the new transaction
- * at all.
- * Implication: if we crash before the writepage() data
- * is written into the filesystem, recovery will replay
- * the write() data.
- */
- if (jh->b_jlist != BJ_None &&
- jh->b_jlist != BJ_SyncData &&
- jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "Not stealing");
- goto no_journal;
- }
-
- /*
- * This buffer may be undergoing writeout in commit. We
- * can't return from here and let the caller dirty it
- * again because that can cause the write-out loop in
- * commit to never terminate.
- */
- if (buffer_dirty(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- need_brelse = 1;
- sync_dirty_buffer(bh);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- /* Since we dropped the lock... */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "buffer got unmapped");
- goto no_journal;
- }
- /* The buffer may become locked again at any
- time if it is redirtied */
- }
-
- /* journal_clean_data_list() may have got there first */
- if (jh->b_transaction != NULL) {
- JBUFFER_TRACE(jh, "unfile from commit");
- __journal_temp_unlink_buffer(jh);
- /* It still points to the committing
- * transaction; move it to this one so
- * that the refile assert checks are
- * happy. */
- jh->b_transaction = handle->h_transaction;
- }
- /* The buffer will be refiled below */
-
- }
- /*
- * Special case --- the buffer might actually have been
- * allocated and then immediately deallocated in the previous,
- * committing transaction, so might still be left on that
- * transaction's metadata lists.
- */
- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "not on correct data list: unfile");
- J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
- __journal_temp_unlink_buffer(jh);
- jh->b_transaction = handle->h_transaction;
- JBUFFER_TRACE(jh, "file as data");
- __journal_file_buffer(jh, handle->h_transaction,
- BJ_SyncData);
- }
- } else {
- JBUFFER_TRACE(jh, "not on a transaction");
- __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
- }
-no_journal:
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- if (need_brelse) {
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- }
- JBUFFER_TRACE(jh, "exit");
- journal_put_journal_head(jh);
- return 0;
-}
-
-/**
* int journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
@@ -1504,10 +1327,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
- * is holding onto a copy of one of thee pointers, it could go bad.
- * Generally the caller needs to re-read the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
+ * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
+ * of thee pointers, it could go bad. Generally the caller needs to re-read
+ * the pointer from the transaction_t.
*
* Called under j_list_lock. The journal may not be locked.
*/
@@ -1529,9 +1352,6 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh)
switch (jh->b_jlist) {
case BJ_None:
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers--;
J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1552,9 +1372,6 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_del_buffer(list, jh);
@@ -1597,15 +1414,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
goto out;
spin_lock(&journal->j_list_lock);
- if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
- /* A written-back ordered data buffer */
- JBUFFER_TRACE(jh, "release data");
- __journal_unfile_buffer(jh);
- journal_remove_journal_head(bh);
- __brelse(bh);
- }
- } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+ if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1786,6 +1595,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
if (!buffer_jbd(bh))
goto zap_buffer_unlocked;
+ /* OK, we have data buffer in journaled mode */
spin_lock(&journal->j_state_lock);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
@@ -1849,15 +1659,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
} else if (transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "on committing transaction");
- if (jh->b_jlist == BJ_Locked) {
- /*
- * The buffer is on the committing transaction's locked
- * list. We have the buffer locked, so I/O has
- * completed. So we can nail the buffer now.
- */
- may_free = __dispose_buffer(jh, transaction);
- goto zap_buffer;
- }
/*
* If it is committing, we simply cannot touch it. We
* can remove it's next_transaction pointer from the
@@ -1990,9 +1791,6 @@ void __journal_file_buffer(struct journal_head *jh,
J_ASSERT_JH(jh, !jh->b_committed_data);
J_ASSERT_JH(jh, !jh->b_frozen_data);
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers++;
list = &transaction->t_buffers;
@@ -2012,9 +1810,6 @@ void __journal_file_buffer(struct journal_head *jh,
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_add_buffer(list, jh);
@@ -2104,3 +1899,70 @@ void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
spin_unlock(&journal->j_list_lock);
__brelse(bh);
}
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int journal_file_inode(handle_t *handle, struct jbd_inode *jinode)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+
+ if (is_handle_aborted(handle))
+ return 0;
+
+ jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+ transaction->t_tid);
+
+ spin_lock(&journal->j_list_lock);
+
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ goto done;
+
+ /* On some different transaction's list - should be
+ * the committing one */
+ if (jinode->i_transaction) {
+ J_ASSERT(jinode->i_next_transaction == NULL);
+ J_ASSERT(jinode->i_transaction ==
+ journal->j_committing_transaction);
+ jinode->i_next_transaction = transaction;
+ goto done;
+ }
+ /* Not on any transaction list... */
+ J_ASSERT(!jinode->i_next_transaction);
+ jinode->i_transaction = transaction;
+ list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+ spin_unlock(&journal->j_list_lock);
+
+ return 0;
+}
+
+/*
+ * This function must be called when inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of truncated part in
+ * case it is in the committing transaction so that we stand to ordered
+ * mode consistency guarantees.
+ */
+int journal_begin_ordered_truncate(struct jbd_inode *inode, loff_t new_size)
+{
+ journal_t *journal;
+ transaction_t *commit_trans;
+ int ret = 0;
+
+ if (!inode->i_transaction && !inode->i_next_transaction)
+ goto out;
+ journal = inode->i_transaction->t_journal;
+ spin_lock(&journal->j_state_lock);
+ commit_trans = journal->j_committing_transaction;
+ spin_unlock(&journal->j_state_lock);
+ if (inode->i_transaction == commit_trans) {
+ ret = __filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+ new_size, LLONG_MAX, WB_SYNC_ALL);
+ if (ret)
+ journal_abort(journal, ret);
+ }
+out:
+ return ret;
+}
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3..4f66bae 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -527,7 +527,10 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
map_bh.b_state = 0;
map_bh.b_size = 1 << blkbits;
- if (mpd->get_block(inode, block_in_file, &map_bh, 1))
+ if (mpd->get_block(inode, block_in_file, &map_bh,
+ !wbc->skip_unmapped))
+ goto confused;
+ if (!buffer_mapped(&map_bh))
goto confused;
if (buffer_new(&map_bh))
unmap_underlying_metadata(map_bh.b_bdev,
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 36c5403..7aa8327 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -826,6 +826,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
extern struct inode *ext3_iget(struct super_block *, unsigned long);
extern int ext3_write_inode (struct inode *, int);
extern int ext3_setattr (struct dentry *, struct iattr *);
+extern void ext3_drop_inode (struct inode *);
extern void ext3_delete_inode (struct inode *);
extern int ext3_sync_inode (handle_t *, struct inode *);
extern void ext3_discard_reservation (struct inode *);
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h
index 7894dd0..afd77b0 100644
--- a/include/linux/ext3_fs_i.h
+++ b/include/linux/ext3_fs_i.h
@@ -142,6 +142,7 @@ struct ext3_inode_info {
*/
struct mutex truncate_mutex;
struct inode vfs_inode;
+ struct jbd_inode jinode;
};
#endif /* _LINUX_EXT3_FS_I */
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index b18fd3b..847328c 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -345,6 +345,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
bit_spin_unlock(BH_JournalHead, &bh->b_state);
}
+/* Flags in jbd_inode->i_flags */
+#define __JI_COMMIT_RUNNING 0
+/* Commit of the inode data in progress. We use this flag to protect us from
+ * concurrent deletion of inode. We cannot use reference to inode for this
+ * since we cannot afford doing last iput() on behalf of kjournald
+ */
+#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
+
+/**
+ * struct jbd_inode is the structure linking inodes in ordered mode
+ * present in a transaction so that we can sync them during commit.
+ */
+struct jbd_inode {
+ /* Which transaction does this inode belong to? Either the running
+ * transaction or the committing one. [j_list_lock] */
+ transaction_t *i_transaction;
+
+ /* Pointer to the running transaction modifying inode's data in case
+ * there is already a committing transaction touching it. [j_list_lock] */
+ transaction_t *i_next_transaction;
+
+ /* List of inodes in the i_transaction [j_list_lock] */
+ struct list_head i_list;
+
+ /* VFS inode this inode belongs to [constant during the lifetime
+ * of the structure] */
+ struct inode *i_vfs_inode;
+
+ /* Flags of inode [j_list_lock] */
+ unsigned int i_flags;
+};
+
struct jbd_revoke_table_s;
/**
@@ -466,24 +498,12 @@ struct transaction_s
struct journal_head *t_reserved_list;
/*
- * Doubly-linked circular list of all buffers under writeout during
- * commit [j_list_lock]
- */
- struct journal_head *t_locked_list;
-
- /*
* Doubly-linked circular list of all metadata buffers owned by this
* transaction [j_list_lock]
*/
struct journal_head *t_buffers;
/*
- * Doubly-linked circular list of all data buffers still to be
- * flushed before this transaction can be committed [j_list_lock]
- */
- struct journal_head *t_sync_datalist;
-
- /*
* Doubly-linked circular list of all forget buffers (superseded
* buffers which we can un-checkpoint once this transaction commits)
* [j_list_lock]
@@ -522,6 +542,12 @@ struct transaction_s
struct journal_head *t_log_list;
/*
+ * List of inodes whose data we've modified in data=ordered mode.
+ * [j_list_lock]
+ */
+ struct list_head t_inode_list;
+
+ /*
* Protects info related to handles
*/
spinlock_t t_handle_lock;
@@ -928,6 +954,10 @@ extern void journal_ack_err (journal_t *);
extern int journal_clear_err (journal_t *);
extern int journal_bmap(journal_t *, unsigned long, unsigned long *);
extern int journal_force_commit(journal_t *);
+int journal_file_inode(handle_t *handle, struct jbd_inode *inode);
+int journal_begin_ordered_truncate(struct jbd_inode *inode, loff_t new_size);
+void journal_init_jbd_inode(struct jbd_inode *jinode, struct inode *inode);
+void journal_release_jbd_inode(journal_t *journal, struct jbd_inode *jinode);
/*
* journal_head management
@@ -1063,15 +1093,13 @@ static inline int jbd_space_needed(journal_t *journal)
/* journaling buffer types */
#define BJ_None 0 /* Not journaled */
-#define BJ_SyncData 1 /* Normal data: flush before commit */
-#define BJ_Metadata 2 /* Normal journaled metadata */
-#define BJ_Forget 3 /* Buffer superseded by this transaction */
-#define BJ_IO 4 /* Buffer is for temporary IO use */
-#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */
-#define BJ_LogCtl 6 /* Buffer contains log descriptors */
-#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
-#define BJ_Locked 8 /* Locked for I/O during commit */
-#define BJ_Types 9
+#define BJ_Metadata 1 /* Normal journaled metadata */
+#define BJ_Forget 2 /* Buffer superseded by this transaction */
+#define BJ_IO 3 /* Buffer is for temporary IO use */
+#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */
+#define BJ_LogCtl 5 /* Buffer contains log descriptors */
+#define BJ_Reserved 6 /* Buffer is reserved for access by journal */
+#define BJ_Types 7
extern int jbd_blocks_per_page(struct inode *inode);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b7b3362..2725e8b 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,6 +63,7 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
+ unsigned skip_unmapped:1; /* A kjournald commit */
};
/*
On Thursday 06 March 2008 12:42:09 pm Jan Kara wrote:
> Hi,
>
> Below is my rewrite of ordered mode in JBD. Instead of a list of data
> buffers that need syncing on transaction commit, we now keep a list of
> inodes that need writeout during commit. This brings all sorts of
> advantages, such as the possibility to get rid of journal heads and buffer
> heads for data buffers in ordered mode, better ordering of writes on
> transaction commit, simplification of some JBD code, and no more anonymous
> pages left behind when data under commit is truncated. The patch has
> survived some light testing, but it still has some potential to eat your
> data, so beware :) I've run dbench to check whether the different handling
> of truncate decreases performance: the throughput I'm getting on my machine
> is the same (OK, lower by 0.5%) if I disable the code in truncate that
> waits for commit to finish... Also, dbench throughput is about 2% better
> with my patch than with current JBD.
> Any comments or testing most welcome.
>
> Honza
Just one nit: it doesn't compile properly when jbd/ext3 are built as
modules :). Thanks much,
Josef
On Thu, 6 Mar 2008 18:42:09 +0100
Jan Kara <[email protected]> wrote:
> Below is my rewrite of ordered mode in JBD. Instead of a list of data
> buffers that need syncing on transaction commit, we now keep a list of
> inodes that need writeout during commit. This brings all sorts of
> advantages, such as the possibility to get rid of journal heads and buffer
> heads for data buffers in ordered mode, better ordering of writes on
> transaction commit, simplification of some JBD code, and no more anonymous
> pages left behind when data under commit is truncated. The patch has
> survived some light testing, but it still has some potential to eat your
> data, so beware :) I've run dbench to check whether the different handling
> of truncate decreases performance: the throughput I'm getting on my machine
> is the same (OK, lower by 0.5%) if I disable the code in truncate that
> waits for commit to finish... Also, dbench throughput is about 2% better
> with my patch than with current JBD.
> Any comments or testing most welcome.
Thanks for plugging away with this.
Please change your patch preparation tools to always always include a
diffstat, OK?
fs/buffer.c | 3
fs/ext3/ialloc.c | 1
fs/ext3/inode.c | 118 +++++++++---------
fs/ext3/super.c | 2
fs/jbd/checkpoint.c | 1
fs/jbd/commit.c | 257 +++++++++++++----------------------------
fs/jbd/journal.c | 45 +++++++
fs/jbd/transaction.c | 288 +++++++++++-----------------------------------
fs/mpage.c | 5
include/linux/ext3_fs.h | 1
include/linux/ext3_fs_i.h | 1
include/linux/jbd.h | 70 +++++++----
include/linux/writeback.h | 2
13 files changed, 326 insertions(+), 468 deletions(-)
Would it make sense to turn this patch into a patch series sometime?
On Thu, Mar 06, 2008 at 06:42:09PM +0100, Jan Kara wrote:
> Below is my rewrite of ordered mode in JBD. Instead of a list of data
> buffers that need syncing on transaction commit, we now keep a list of
> inodes that need writeout during commit. This brings all sorts of
> advantages, such as the possibility to get rid of journal heads and buffer
> heads for data buffers in ordered mode, better ordering of writes on
> transaction commit, simplification of some JBD code, and no more anonymous
> pages left behind when data under commit is truncated.
There's a lot of JBD code which gets removed by this patch - cool :)
> The patch has survived some light testing, but it still has some potential
> to eat your data, so beware :)
Looking through the patch, I don't see how you solve the page lock /
transaction ordering issues. I see that you avoid starting a handle in
->writepage during transaction commit, but what about another process which
starts a handle under page lock and needs to wait for transactions to be
written out before continuing?
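For example (a purely hypothetical interleaving, just to illustrate what I
mean):

    process A                          kjournald (committing)
    ---------                          ----------------------
    lock_page(page)
    journal_start()
      blocks, e.g. waiting for
      journal space, which needs
      the commit to make progress
                                       journal_submit_data_buffers()
                                         do_writepages()
                                           lock_page(page)   <- blocks

at which point neither side can make progress.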
> I've run dbench to check whether the different handling of truncate
> decreases performance: the throughput I'm getting on my machine is the
> same (OK, lower by 0.5%) if I disable the code in truncate that waits for
> commit to finish...
> Also, dbench throughput is about 2% better with my patch than with
> current JBD.
> Any comments or testing most welcome.
My attempt at helpful review follows.
> @@ -1465,15 +1444,11 @@ static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
> * We don't honour synchronous mounts for writepage(). That would be
> * disastrous. Any write() or metadata operation will sync the fs for
> * us.
> - *
> - * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
> - * we don't need to open a transaction here.
> */
> static int ext3_ordered_writepage(struct page *page,
> struct writeback_control *wbc)
> {
> struct inode *inode = page->mapping->host;
> - struct buffer_head *page_bufs;
> handle_t *handle = NULL;
> int ret = 0;
> int err;
> @@ -1487,46 +1462,49 @@ static int ext3_ordered_writepage(struct page *page,
> if (ext3_journal_current_handle())
> goto out_fail;
>
> - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
> -
> - if (IS_ERR(handle)) {
> - ret = PTR_ERR(handle);
> - goto out_fail;
> + /*
> + * Now there are two different reasons why we can be called:
> + * 1) write out during commit
> + * 2) fsync / writeout to free memory
> + *
> + * In the first case, we just need to write the buffer to disk, in the
> + * second case we may need to do hole filling and attach the inode to
> + * the transaction. Note that even in the first case, we may get an
> + * unmapped buffer (hole fill with data via mmap) but we don't have to
> + * write it - actually, we can't because from a transaction commit we
> + * cannot start a new transaction or we could deadlock.
What happens to the data if we get here under the 1st case with a hole? Do
we eventually fill the hole (with correct data) via some mechanism I don't
see here?
<snip>
> +/*
> + * This function must be called when inode is journaled in ordered mode
> + * before truncation happens. It starts writeout of truncated part in
> + * case it is in the committing transaction so that we stand to ordered
> + * mode consistency guarantees.
> + */
> +int journal_begin_ordered_truncate(struct jbd_inode *inode, loff_t new_size)
> +{
> + journal_t *journal;
> + transaction_t *commit_trans;
> + int ret = 0;
> +
> + if (!inode->i_transaction && !inode->i_next_transaction)
> + goto out;
> + journal = inode->i_transaction->t_journal;
> + spin_lock(&journal->j_state_lock);
> + commit_trans = journal->j_committing_transaction;
> + spin_unlock(&journal->j_state_lock);
> + if (inode->i_transaction == commit_trans) {
AFAICT, this is called in ext3 before a handle is started for truncate. Is
it possible for the current running transaction to become the new committing
transaction shortly after the spinlock is dropped, but before the truncate
transaction starts? Could this lead to ordered data not being written out if
the inode is part of the current running transaction but not part of the
committing transaction?
> + ret = __filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
> + new_size, LLONG_MAX, WB_SYNC_ALL);
> + if (ret)
> + journal_abort(journal, ret);
> + }
> +out:
> + return ret;
> +}
> diff --git a/fs/mpage.c b/fs/mpage.c
> index 235e4d3..4f66bae 100644
> --- a/fs/mpage.c
> +++ b/fs/mpage.c
> @@ -527,7 +527,10 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
>
> map_bh.b_state = 0;
> map_bh.b_size = 1 << blkbits;
> - if (mpd->get_block(inode, block_in_file, &map_bh, 1))
> + if (mpd->get_block(inode, block_in_file, &map_bh,
> + !wbc->skip_unmapped))
> + goto confused;
> + if (!buffer_mapped(&map_bh))
> goto confused;
> if (buffer_new(&map_bh))
> unmap_underlying_metadata(map_bh.b_bdev,
> diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
> index 36c5403..7aa8327 100644
> --- a/include/linux/ext3_fs.h
> +++ b/include/linux/ext3_fs.h
> @@ -826,6 +826,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
> extern struct inode *ext3_iget(struct super_block *, unsigned long);
> extern int ext3_write_inode (struct inode *, int);
> extern int ext3_setattr (struct dentry *, struct iattr *);
> +extern void ext3_drop_inode (struct inode *);
Not sure what this is here for...
Thanks,
--Mark
--
Mark Fasheh
Principal Software Developer, Oracle
[email protected]
On Thu, 2008-03-06 at 18:42 +0100, Jan Kara wrote:
> Hi,
>
Hi Jan,
> Below is my rewrite of ordered mode in JBD. Instead of a list of data
> buffers that need syncing on transaction commit, we now keep a list of
> inodes that need writeout during commit. This brings all sorts of
> advantages, such as the possibility to get rid of journal heads and buffer
> heads for data buffers in ordered mode, better ordering of writes on
> transaction commit, simplification of some JBD code, and no more anonymous
> pages left behind when data under commit is truncated. The patch has
> survived some light testing, but it still has some potential to eat your
> data, so beware :) I've run dbench to check whether the different handling
> of truncate decreases performance: the throughput I'm getting on my machine
> is the same (OK, lower by 0.5%) if I disable the code in truncate that
> waits for commit to finish... Also, dbench throughput is about 2% better
> with my patch than with current JBD.
I know ext4 keeps changing, which makes it a bit hard to create a patch
against ext4, but I feel that features like this, and especially a rewrite
of the default ordered mode, should be done in ext4/jbd2. I could port the
patch to current ext4 and JBD2 if you agree with this.
Also, would it make sense to create new ordered mode writepage routines and
keep the old ordered mode code around for a while, to allow easy
comparison? This could be a good transition for people to start
experimenting with this ordered mode without putting their data in danger
by default.
> Any comments or testing most welcome.
[...]
> /*
> * Note that we always start a transaction even if we're not journalling
> * data. This is to preserve ordering: any hole instantiation within
> @@ -1465,15 +1444,11 @@ static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
> * We don't honour synchronous mounts for writepage(). That would be
> * disastrous. Any write() or metadata operation will sync the fs for
> * us.
> - *
> - * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
> - * we don't need to open a transaction here.
> */
> static int ext3_ordered_writepage(struct page *page,
> struct writeback_control *wbc)
> {
> struct inode *inode = page->mapping->host;
> - struct buffer_head *page_bufs;
> handle_t *handle = NULL;
> int ret = 0;
> int err;
> @@ -1487,46 +1462,49 @@ static int ext3_ordered_writepage(struct page *page,
> if (ext3_journal_current_handle())
> goto out_fail;
>
> - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
> -
> - if (IS_ERR(handle)) {
> - ret = PTR_ERR(handle);
> - goto out_fail;
> + /*
> + * Now there are two different reasons why we can be called:
> + * 1) write out during commit
> + * 2) fsync / writeout to free memory
> + *
> + * In the first case, we just need to write the buffer to disk, in the
> + * second case we may need to do hole filling and attach the inode to
> + * the transaction. Note that even in the first case, we may get an
> + * unmapped buffer (hole fill with data via mmap) but we don't have to
> + * write it - actually, we can't because from a transaction commit we
> + * cannot start a new transaction or we could deadlock.
> + */
Any thoughts on how to handle the unmapped page under case 1)? Right now I
see it fails. Your comment here says that we still have the issue that we
"can't start a new transaction while committing", but with delayed
allocation, starting a new transaction to do the deferred block allocation
could happen a lot.
I really hope this new mode will make it easy to add delayed allocation
support. Any thoughts on how we could work around the locking in the JBD2
layer?
> + if (!wbc->skip_unmapped) {
> + handle = ext3_journal_start(inode,
> + ext3_writepage_trans_blocks(inode));
> + if (IS_ERR(handle)) {
> + ret = PTR_ERR(handle);
> + goto out_fail;
> + }
> }
> + else if (!PageMappedToDisk(page))
> + goto out_fail;
>
Thanks & Regards,
Mingming
On Mar 06, 2008 18:42 +0100, Jan Kara wrote:
> Below is my rewrite of ordered mode in JBD. Instead of a list of data
> buffers that need syncing on transaction commit, we now keep a list of
> inodes that need writeout during commit. This brings all sorts of
> advantages, such as the possibility to get rid of journal heads and buffer
> heads for data buffers in ordered mode, better ordering of writes on
> transaction commit, simplification of some JBD code, and no more anonymous
> pages left behind when data under commit is truncated. The patch has
> survived some light testing, but it still has some potential to eat your
> data, so beware :) I've run dbench to check whether the different handling
> of truncate decreases performance: the throughput I'm getting on my machine
> is the same (OK, lower by 0.5%) if I disable the code in truncate that
> waits for commit to finish... Also, dbench throughput is about 2% better
> with my patch than with current JBD.
> Any comments or testing most welcome.
Looks like a very good patch - thanks for your effort in moving this
beyond the "hand-waving" stage that it's been in for the past few years.
I'm looking at what implications this has for delayed allocation in ext4,
because the vast majority of file data will be unmapped in that case
and a journal commit in ordered mode will no longer cause the data to
be flushed to disk.
I _think_ this is OK, because the pdflush daemon will now be totally in
charge of
flushing the dirty pages to disk, instead of this previously being done
by ordered mode in the journal. I know there have been some bugs in this
area in the past, but I guess it isn't much different than running in
writeback mode. That said, I don't know how many users run in writeback
mode unless they are running a database, and the database is doing a lot
of explicit fsync of file data so there may still be bugs lurking...
Some care is still needed here because with ext4 delayed allocation the
common case will be unmapped buffers, while the ext3 implementation will
only have this in very rare cases (unmapped mmap writes), but it looks
like the right mechanism is already in place for both.
I'll just put a bunch of comments inline, not necessarily problems, just
walking through the code to ensure I understand what is going on...
> @@ -1675,7 +1675,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
> */
> clear_buffer_dirty(bh);
> set_buffer_uptodate(bh);
> - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
> + } else if (!buffer_mapped(bh) && buffer_dirty(bh)
> + && !wbc->skip_unmapped) {
(comment) This is skipping writeout of unmapped pages during journal commit -
good, otherwise we would deadlock trying to add block mappings...
> @@ -183,6 +184,8 @@ void ext3_delete_inode (struct inode * inode)
> {
> handle_t *handle;
>
> + if (ext3_should_order_data(inode))
> + ext3_begin_ordered_truncate(inode, 0);
(comment) This is flushing out pages allocated in the previous transaction
to keep consistent semantics in case a crash loses the delete... There
is some possibility for optimization here - comments below at
journal_begin_ordered_truncate().
> @@ -1487,46 +1462,49 @@ static int ext3_ordered_writepage(struct page *page,
> + if (!wbc->skip_unmapped) {
> + handle = ext3_journal_start(inode,
> + ext3_writepage_trans_blocks(inode));
> + if (IS_ERR(handle)) {
> + ret = PTR_ERR(handle);
> + goto out_fail;
> + }
> }
I guess the common case here for delalloc is that if someone cares to fsync
the file then that inode would be added to the journal list and the pages
would be mapped here in the process context and flushed with the journal
commit. This is ideal because it avoids syncing out all of the dirty pages
for inodes that were NOT fsync'd and fixes the "one process doing fsync
kills performance for streaming writes" problem in ordered mode that was
recently discussed on the list.
We in fact win twice because fsync_page_range() will potentially only
map and flush the range of pages that the application cares about and
does not necessarily have to write out all pages (though that will still
happen in ordered mode if the pages had previously been mapped).
> + else if (!PageMappedToDisk(page))
> + goto out_fail;
(style) } else if (...) {
goto out_fail;
}
> + /* This can go as soon as someone cares about that ;) */
> if (!page_has_buffers(page)) {
> create_empty_buffers(page, inode->i_sb->s_blocksize,
> (1 << BH_Dirty)|(1 << BH_Uptodate));
> }
Is there any reason to keep this in the non-data-journal case? Adding
buffer heads to every page is a non-trivial amount of RAM usage.
> @@ -2989,7 +2973,14 @@ int ext3_write_inode(struct inode *inode, int wait)
> * be freed, so we have a strong guarantee that no future commit will
> * leave these blocks visible to the user.)
> *
> - * Called with inode->sem down.
> + * Another thing we have to asure is that if we are in ordered mode
s/asure/assure/
> + * and inode is still attached to the committing transaction, we must
> + * we start writeout of all the dirty buffers which are being truncated.
s/buffers/pages/
> @@ -3032,6 +3023,13 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
> attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
> handle_t *handle;
>
> + if (ext3_should_order_data(inode)) {
> + error = ext3_begin_ordered_truncate(inode,
> + attr->ia_size);
(style) align "attr->ia_size" with previous '('.
> @@ -520,6 +520,8 @@ static void ext3_clear_inode(struct inode *inode)
> EXT3_I(inode)->i_block_alloc_info = NULL;
> if (unlikely(rsv))
> kfree(rsv);
> + journal_release_jbd_inode(EXT3_SB(inode->i_sb)->s_journal,
> + &EXT3_I(inode)->jinode);
(style) align with '(' on previous line.
> /*
> - * When an ext3-ordered file is truncated, it is possible that many pages are
> - * not sucessfully freed, because they are attached to a committing transaction.
> + * When an ext3 file is truncated, it is possible that some pages are not
> + * sucessfully freed, because they are attached to a committing transaction.
s/sucessfully/successfully/
> -static int inverted_lock(journal_t *journal, struct buffer_head *bh)
> -{
> - if (!jbd_trylock_bh_state(bh)) {
> - spin_unlock(&journal->j_list_lock);
> - schedule();
> - return 0;
> - }
> - return 1;
> -}
Always nice to see unpleasant code like this be removed.
> +static int journal_submit_data_buffers(journal_t *journal,
> + transaction_t *commit_transaction)
> {
> + /*
> + * We are in a committing transaction. Therefore no new inode
> + * can be added to our inode list. We use JI_COMMIT_RUNNING
> + * flag to protect inode we currently operate on from being
> + * released while we write out pages.
> + */
Should we J_ASSERT() this is true? Probably better to put this comment
before the function instead of inside it.
> + spin_lock(&journal->j_list_lock);
> + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
> + mapping = jinode->i_vfs_inode->i_mapping;
> + if (!mapping_cap_writeback_dirty(mapping))
> + continue;
> + wbc.nr_to_write = mapping->nrpages * 2;
> + jinode->i_flags |= JI_COMMIT_RUNNING;
> + spin_unlock(&journal->j_list_lock);
> + err = do_writepages(jinode->i_vfs_inode->i_mapping, &wbc);
> + if (!ret)
> + ret = err;
> + spin_lock(&journal->j_list_lock);
> + jinode->i_flags &= ~JI_COMMIT_RUNNING;
> + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
> }
> + spin_unlock(&journal->j_list_lock);
Hmm, this is one area I'm a bit worried about. In the old code the number
of buffers to sync for a transaction are bounded because they can only be
added to the transaction while it is open. With this new inode-list code
there could be a process busily adding new dirty + mapped pages to the
inode(s) in the running transaction, while we are blocked here writing
out the pages in kjournald trying to commit the previous transaction.
At some point we will eventually no longer be able to add new pages to the
running transaction (because it is full or was closed due to timeout) and
we won't be able to start a new transaction because the committing transaction
has not yet committed. At that point we would be able to finish the commit
of the previous transaction, but not until we had reached the point of
blocking all operations in the filesystem waiting for the new transaction
to open. This would essentially lead to livelock of the system, with
stop-start-stop cycles for the transactions and IO.
What would be needed is some kind of mechanism to separate the dirty pages
on the inode into "dirty from current transaction" and "dirty from the
previous transaction". That seems non-trivial, however. I guess we could
also force any process doing writes to flush out all of the dirty and
mapped pages on the inode if it detects the inode is in two transactions.
This would at least parallelize the IO submission and also throttle the
addition of new pages until the dirty ones on the committing transaction
had been flushed but may essentially mean sync IO performance for streaming
writes on large files without delalloc (which would not add new pages that
are mapped normally).
> +static int journal_finish_data_buffers(journal_t *journal,
> + transaction_t *commit_transaction)
> {
> + /* Now refile inode to proper lists */
> + list_for_each_entry_safe(jinode, next_i,
> + &commit_transaction->t_inode_list, i_list) {
(style) align with '(' on previous line
> + list_del(&jinode->i_list);
> + if (jinode->i_next_transaction) {
> + jinode->i_transaction = jinode->i_next_transaction;
> + jinode->i_next_transaction = NULL;
> + list_add(&jinode->i_list,
> + &jinode->i_transaction->t_inode_list);
> }
> + else
> + jinode->i_transaction = NULL;
(style) } else {
jinode->i_transaction = NULL;
}
> @@ -655,7 +565,14 @@ start_journal_io:
> so we incur less scheduling load.
> */
>
> - jbd_debug(3, "JBD: commit phase 4\n");
> + jbd_debug(3, "JBD: commit phase 3\n");
Rather than numbering these phases, which has little meaning and often
means they just get renumbered like here, it is probably more useful to
give them descriptive names like "JBD: commit wait for data buffers", etc.
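E.g. (illustrative only):

	jbd_debug(3, "JBD: commit: waiting for data buffers\n");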
> @@ -1504,10 +1327,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
> * Remove a buffer from the appropriate transaction list.
> *
> * Note that this function can *change* the value of
> - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
> - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
> - * is holding onto a copy of one of thee pointers, it could go bad.
> - * Generally the caller needs to re-read the pointer from the transaction_t.
> + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
> + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
> + * of thee pointers, it could go bad. Generally the caller needs to re-read
s/thee/these/
> +/*
> + * File inode in the inode list of the handle's transaction
> + */
> +int journal_file_inode(handle_t *handle, struct jbd_inode *jinode)
> +{
> + transaction_t *transaction = handle->h_transaction;
> + journal_t *journal = transaction->t_journal;
> +
> + if (is_handle_aborted(handle))
> + return 0;
Should we be returning an error back to the caller at this point?
> + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
> + transaction->t_tid);
> +
> + spin_lock(&journal->j_list_lock);
> +
> + if (jinode->i_transaction == transaction ||
> + jinode->i_next_transaction == transaction)
> + goto done;
Is it possible/safe to do the above check outside of the j_list_lock
as an optimization? We will be calling this function for every page
that is dirtied and it will likely be quite highly contended. It used
to be that we HAD to get this lock because we were almost certainly
adding the new buffers to the transaction list, but in this updated
code the common case is that the inode will already be on the list...
> +/*
> + * This function must be called when inode is journaled in ordered mode
> + * before truncation happens. It starts writeout of truncated part in
> + * case it is in the committing transaction so that we stand to ordered
> + * mode consistency guarantees.
> + */
> +int journal_begin_ordered_truncate(struct jbd_inode *inode, loff_t new_size)
> +{
> + journal_t *journal;
> + transaction_t *commit_trans;
> + int ret = 0;
> +
> + if (!inode->i_transaction && !inode->i_next_transaction)
> + goto out;
> + journal = inode->i_transaction->t_journal;
> + spin_lock(&journal->j_state_lock);
> + commit_trans = journal->j_committing_transaction;
> + spin_unlock(&journal->j_state_lock);
> + if (inode->i_transaction == commit_trans) {
> + ret = __filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
> + new_size, LLONG_MAX, WB_SYNC_ALL);
> + if (ret)
> + journal_abort(journal, ret);
> + }
Is there any way here to entirely avoid writing blocks which were just
allocated during the current transaction (e.g. temp files during compile
or whatever)? One possibility is to store the transaction number when
the inode was created (or first dirtied?) in ext3_inode_info and if that is
equal to the committing transaction then we don't need to write anything
at all?
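Something like the following in journal_begin_ordered_truncate(), once
commit_trans is known - where i_dirty_tid is a hypothetical new field
recording the tid of the transaction that first dirtied the inode (a sketch
of the idea only):

	/* i_dirty_tid is hypothetical - it is not in the posted patch */
	if (inode->i_dirty_tid == commit_trans->t_tid)
		goto out;	/* no data from an older transaction to order */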
Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.
On Fri, 2008-03-07 at 16:52 -0700, Andreas Dilger wrote:
> On Mar 06, 2008 18:42 +0100, Jan Kara wrote:
> > Below is my rewrite of ordered mode in JBD. Now we don't have a list of
> > data buffers that need syncing on transaction commit but a list of inodes
> > that need writeout during commit. This brings all sorts of advantages such
> > as possibility to get rid of journal heads and buffer heads for data
> > buffers in ordered mode, better ordering of writes on transaction commit,
> > simplification of some JBD code, no more anonymous pages when truncate of
> > data being committed happens. The patch has survived some light testing
> > but it still has some potential of eating your data so beware :) I've run
> > dbench to see whether we didn't decrease performance by different handling
> > of truncate and the throughput I'm getting on my machine is the same (OK,
> > is lower by 0.5%) if I disable the code in truncate waiting for commit to
> > finish... Also the throughput of dbench is about 2% better with my patch
> > than with current JBD.
> > Any comments or testing most welcome.
>
> Looks like a very good patch - thanks for your effort in moving this
> beyond the "hand-waving" stage that it's been in for the past few years.
>
> I'm looking at what implications this has for delayed allocation in ext4,
> because the vast majority of file data will be unmapped in that case
> and a journal commit in ordered mode will no longer cause the data to
> be flushed to disk.
>
> I _think_ this is OK, because the pdflushd will now be totally in charge of
> flushing the dirty pages to disk, instead of this previously being done
> by ordered mode in the journal.
I missed something here, just trying to understand: if a journal commit
in ordered mode will no longer cause the data to be flushed to disk, how
could we ensure the ordering? Are you suggesting that with delayed
allocation the journalling mode falls back to writeback mode?
On Fri, Mar 07, 2008 at 04:52:10PM -0700, Andreas Dilger wrote:
> I'm looking at what implications this has for delayed allocation in ext4,
> because the vast majority of file data will be unmapped in that case
> and a journal commit in ordered mode will no longer cause the data to
> be flushed to disk.
The buffers shouldn't be unmapped. They are accounted for and doing
the delalloc conversion is easier than really allocating blocks for
truly unmapped blocks. You should probably reuse BH_Delay for that
as it has all the right handling in buffer.c in place due to XFS.
Also on any filesystem with ->page_mkwrite implemented unmapped buffers
should be entirely gone because we now have the proper early
reservation / allocation infrastructure in place.
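To illustrate (a sketch of the suggestion, not code from the patch): with
delalloc buffers carrying BH_Delay, the test in __block_write_full_page()
could key off the buffer state, e.g.:

	} else if (buffer_delay(bh) && buffer_dirty(bh)
		   && !wbc->skip_unmapped) {
		/* delalloc buffer: space is reserved, so get_block only
		 * turns the reservation into a real block mapping */
		WARN_ON(bh->b_size != blocksize);
		err = get_block(inode, block, bh, 1);
		if (err)
			goto recover;
	}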
On Thu 06-03-08 14:05:03, Josef Bacik wrote:
> On Thursday 06 March 2008 12:42:09 pm Jan Kara wrote:
> > Hi,
> >
> > Below is my rewrite of ordered mode in JBD. Now we don't have a list of
> > data buffers that need syncing on transaction commit but a list of inodes
> > that need writeout during commit. This brings all sorts of advantages such
> > as possibility to get rid of journal heads and buffer heads for data
> > buffers in ordered mode, better ordering of writes on transaction commit,
> > simplification of some JBD code, no more anonymous pages when truncate of
> > data being committed happens. The patch has survived some light testing
> > but it still has some potential of eating your data so beware :) I've run
> > dbench to see whether we didn't decrease performance by different handling
> > of truncate and the throughput I'm getting on my machine is the same (OK,
> > is lower by 0.5%) if I disable the code in truncate waiting for commit to
> > finish... Also the throughput of dbench is about 2% better with my patch
> > than with current JBD.
> > Any comments or testing most welcome.
> >
> > Honza
>
> Just one nit, doesn't compile properly when jbd/ext3 are modules :). Thanks
> much,
Thanks for spotting this. Will fix :).
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu 06-03-08 15:53:01, Andrew Morton wrote:
> On Thu, 6 Mar 2008 18:42:09 +0100
> Jan Kara <[email protected]> wrote:
>
> > Below is my rewrite of ordered mode in JBD. Now we don't have a list of
> > data buffers that need syncing on transaction commit but a list of inodes
> > that need writeout during commit. This brings all sorts of advantages such
> > as possibility to get rid of journal heads and buffer heads for data
> > buffers in ordered mode, better ordering of writes on transaction commit,
> > simplification of some JBD code, no more anonymous pages when truncate of
> > data being committed happens. The patch has survived some light testing
> > but it still has some potential of eating your data so beware :) I've run
> > dbench to see whether we didn't decrease performance by different handling
> > of truncate and the throughput I'm getting on my machine is the same (OK,
> > is lower by 0.5%) if I disable the code in truncate waiting for commit to
> > finish... Also the throughput of dbench is about 2% better with my patch
> > than with current JBD.
> > Any comments or testing most welcome.
>
> Thanks for plugging away with this.
>
> Please change your patch preparation tools to always always include a
> diffstat, OK?
Hmm, I mostly submit patches by hand but I'll try not to forget to
generate patches with git-diff --stat ;).
> fs/buffer.c | 3
> fs/ext3/ialloc.c | 1
> fs/ext3/inode.c | 118 +++++++++---------
> fs/ext3/super.c | 2
> fs/jbd/checkpoint.c | 1
> fs/jbd/commit.c | 257 +++++++++++++----------------------------
> fs/jbd/journal.c | 45 +++++++
> fs/jbd/transaction.c | 288 +++++++++++-----------------------------------
> fs/mpage.c | 5
> include/linux/ext3_fs.h | 1
> include/linux/ext3_fs_i.h | 1
> include/linux/jbd.h | 70 +++++++----
> include/linux/writeback.h | 2
> 13 files changed, 326 insertions(+), 468 deletions(-)
>
> Would it make sense to turn this patch into a patch series sometime?
We can definitely split out the ext3 and JBD changes (although ext3 would
not compile after the JBD changes). I'll have a look at what Mingming
suggests - whether we could make both modes coexist reasonably easily. In
that case the patches could also be split into smaller chunks.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu 06-03-08 17:34:27, Mark Fasheh wrote:
> On Thu, Mar 06, 2008 at 06:42:09PM +0100, Jan Kara wrote:
> > Below is my rewrite of ordered mode in JBD. Now we don't have a list of
> > data buffers that need syncing on transaction commit but a list of inodes
> > that need writeout during commit. This brings all sorts of advantages such
> > as possibility to get rid of journal heads and buffer heads for data
> > buffers in ordered mode, better ordering of writes on transaction commit,
> > simplification of some JBD code, no more anonymous pages when truncate of
> > data being committed happens.
>
> There's a lot of JBD code which gets removed by this patch - cool :)
>
>
> > The patch has survived some light testing but it still has some potential
> > of eating your data so beware :)
>
> Looking through the patch, I don't see how you solve the page lock /
> transaction ordering issues. I see that you avoid starting a handle in
> ->writepage during transaction commit, but what about another process which
> starts a handle under page lock and needs to wait for transactions to be
> written out before continuing?
Ho, hum, drat. I think you're right. And I don't see a way around this
besides reversing the order of page_lock and transaction start in the whole
of ext3/4 :(... I'll think about how we could solve this problem, but I'm
afraid that rewrite is the easiest option.
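Schematically, the inversion from Mark's scenario (my summary, not code
from the patch):

	/*
	 * task A:    lock_page(page); journal_start(...);
	 *            -> blocks waiting for the committing transaction
	 * kjournald: commit -> ordered data writeout -> lock_page(page)
	 *            -> blocks on task A => deadlock
	 */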
> > I've run dbench to see whether we didn't decrease performance by different
> > handling of truncate and the throughput I'm getting on my machine is the
> > same (OK, is lower by 0.5%) if I disable the code in truncate waiting for
> > commit to finish...
> > Also the throughput of dbench is about 2% better with my patch than with
> > current JBD.
> > Any comments or testing most welcome.
>
> My attempt at helpful review follows.
>
>
> > @@ -1465,15 +1444,11 @@ static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
> > * We don't honour synchronous mounts for writepage(). That would be
> > * disastrous. Any write() or metadata operation will sync the fs for
> > * us.
> > - *
> > - * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
> > - * we don't need to open a transaction here.
> > */
> > static int ext3_ordered_writepage(struct page *page,
> > struct writeback_control *wbc)
> > {
> > struct inode *inode = page->mapping->host;
> > - struct buffer_head *page_bufs;
> > handle_t *handle = NULL;
> > int ret = 0;
> > int err;
> > @@ -1487,46 +1462,49 @@ static int ext3_ordered_writepage(struct page *page,
> > if (ext3_journal_current_handle())
> > goto out_fail;
> >
> > - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
> > -
> > - if (IS_ERR(handle)) {
> > - ret = PTR_ERR(handle);
> > - goto out_fail;
> > + /*
> > + * Now there are two different reasons why we can be called:
> > + * 1) write out during commit
> > + * 2) fsync / writeout to free memory
> > + *
> > + * In the first case, we just need to write the buffer to disk, in the
> > + * second case we may need to do hole filling and attach the inode to
> > + * the transaction. Note that even in the first case, we may get an
> > + * unmapped buffer (hole fill with data via mmap) but we don't have to
> > + * write it - actually, we can't because from a transaction commit we
> > + * cannot start a new transaction or we could deadlock.
>
> What happens to the data if we get here under the 1st case with a hole? Do
> we eventually fill the hole (with correct data) via some mechanism I don't
> see here?
Yes, eventually pdflush will find the page, call writepage() and that
will fill the hole.
> <snip>
>
>
> > +/*
> > + * This function must be called when inode is journaled in ordered mode
> > + * before truncation happens. It starts writeout of truncated part in
> > + * case it is in the committing transaction so that we stand to ordered
> > + * mode consistency guarantees.
> > + */
> > +int journal_begin_ordered_truncate(struct jbd_inode *inode, loff_t new_size)
> > +{
> > + journal_t *journal;
> > + transaction_t *commit_trans;
> > + int ret = 0;
> > +
> > + if (!inode->i_transaction && !inode->i_next_transaction)
> > + goto out;
> > + journal = inode->i_transaction->t_journal;
> > + spin_lock(&journal->j_state_lock);
> > + commit_trans = journal->j_committing_transaction;
> > + spin_unlock(&journal->j_state_lock);
> > + if (inode->i_transaction == commit_trans) {
>
> AFAICT, this is called in ext3 before a handle is started for truncate. Is
> it possible for the current running transaction to become the new committing
> transaction shortly after the spinlock is dropped, but before the truncate
> transaction starts? Could this lead to ordered data not being written out if
> the inode is part of the current running transaction but not part of the
> committing transaction?
Good catch. Actually, the problem isn't in
journal_begin_ordered_truncate() but you are right that we should call this
function only after we add the inode to the orphan list. That way we know
that if the current running transaction commits, the inode will be
truncated at least during journal replay and thus we are safe from the
races you describe above. Fixed.
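I.e. in ext3_setattr() the ordering becomes roughly (sketch; details may
differ in the updated patch):

	handle = ext3_journal_start(inode, 3);
	error = ext3_orphan_add(handle, inode);
	/* Only now, with the orphan entry in place, start ordered
	 * writeout of the part being truncated */
	if (!error && ext3_should_order_data(inode))
		error = ext3_begin_ordered_truncate(inode, attr->ia_size);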
> > + ret = __filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
> > + new_size, LLONG_MAX, WB_SYNC_ALL);
> > + if (ret)
> > + journal_abort(journal, ret);
> > + }
> > +out:
> > + return ret;
> > +}
> > diff --git a/fs/mpage.c b/fs/mpage.c
> > index 235e4d3..4f66bae 100644
> > --- a/fs/mpage.c
> > +++ b/fs/mpage.c
> > @@ -527,7 +527,10 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
> >
> > map_bh.b_state = 0;
> > map_bh.b_size = 1 << blkbits;
> > - if (mpd->get_block(inode, block_in_file, &map_bh, 1))
> > + if (mpd->get_block(inode, block_in_file, &map_bh,
> > + !wbc->skip_unmapped))
> > + goto confused;
> > + if (!buffer_mapped(&map_bh))
> > goto confused;
> > if (buffer_new(&map_bh))
> > unmap_underlying_metadata(map_bh.b_bdev,
> > diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
> > index 36c5403..7aa8327 100644
> > --- a/include/linux/ext3_fs.h
> > +++ b/include/linux/ext3_fs.h
> > @@ -826,6 +826,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
> > extern struct inode *ext3_iget(struct super_block *, unsigned long);
> > extern int ext3_write_inode (struct inode *, int);
> > extern int ext3_setattr (struct dentry *, struct iattr *);
> > +extern void ext3_drop_inode (struct inode *);
>
> Not sure what this is here for...
Removed - it was left there from the previous version of the patch.
Thanks for a really helpful review.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
Hi Mingming,
On Fri 07-03-08 02:55:01, Mingming Cao wrote:
> On Thu, 2008-03-06 at 18:42 +0100, Jan Kara wrote:
> > Below is my rewrite of ordered mode in JBD. Now we don't have a list of
> > data buffers that need syncing on transaction commit but a list of inodes
> > that need writeout during commit. This brings all sorts of advantages such
> > as possibility to get rid of journal heads and buffer heads for data
> > buffers in ordered mode, better ordering of writes on transaction commit,
> > simplification of some JBD code, no more anonymous pages when truncate of
> > data being committed happens. The patch has survived some light testing
> > but it still has some potential of eating your data so beware :) I've run
> > dbench to see whether we didn't decrease performance by different handling
> > of truncate and the throughput I'm getting on my machine is the same (OK,
> > is lower by 0.5%) if I disable the code in truncate waiting for commit to
> > finish... Also the throughput of dbench is about 2% better with my patch
> > than with current JBD.
>
> I know ext4 keeps changing so it's a bit hard to create a patch
> against ext4, but I feel that features like this rewrite of the default
> ordered mode especially should be done in ext4/jbd2. I could port it to
> current ext4 and JBD2 if you agree with this.
I definitely agree with you :). This was just the first version of the
patch, more like a proof-of-concept, and it was easier to code against
jbd/ext3 for me :) But when I have something more definitive, I'll port it
against jbd2/ext4.
> Also, would it make sense to create new ordered mode writepage
> routines and keep the old ordered mode code there for a while, to allow
> easy comparison? This could be a good transition for people to start
> experimenting with this ordered mode without worrying about putting data
> in danger by default.
This is an interesting idea. I'll have a look at how hard it would be to
do that. It would also be good because we could separate the patch into
several chunks that would still compile / work between them.
> > Any comments or testing most welcome
>
> [...]
> > /*
> > * Note that we always start a transaction even if we're not journalling
> > * data. This is to preserve ordering: any hole instantiation within
> > @@ -1465,15 +1444,11 @@ static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
> > * We don't honour synchronous mounts for writepage(). That would be
> > * disastrous. Any write() or metadata operation will sync the fs for
> > * us.
> > - *
> > - * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
> > - * we don't need to open a transaction here.
> > */
> > static int ext3_ordered_writepage(struct page *page,
> > struct writeback_control *wbc)
> > {
> > struct inode *inode = page->mapping->host;
> > - struct buffer_head *page_bufs;
> > handle_t *handle = NULL;
> > int ret = 0;
> > int err;
> > @@ -1487,46 +1462,49 @@ static int ext3_ordered_writepage(struct page *page,
> > if (ext3_journal_current_handle())
> > goto out_fail;
> >
> > - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
> > -
> > - if (IS_ERR(handle)) {
> > - ret = PTR_ERR(handle);
> > - goto out_fail;
> > + /*
> > + * Now there are two different reasons why we can be called:
> > + * 1) write out during commit
> > + * 2) fsync / writeout to free memory
> > + *
> > + * In the first case, we just need to write the buffer to disk, in the
> > + * second case we may need to do hole filling and attach the inode to
> > + * the transaction. Note that even in the first case, we may get an
> > + * unmapped buffer (hole fill with data via mmap) but we don't have to
> > + * write it - actually, we can't because from a transaction commit we
> > + * cannot start a new transaction or we could deadlock.
> > + */
>
> Any thoughts on how to handle the unmapped page under case 1)? Right now I
> see it fails. Your comments here say that we still have the issue
> that "can't start a new transaction while committing", but likely, with
> delayed allocation, starting a new transaction could happen a lot to
> do deferred block allocation.
>
> I really hope this new mode will make it easy to add delayed allocation
> support. Any thoughts on how we could work around the locking in the JBD2
> layer?
"Can't start a new transaction while committing" is a fundamental problem
- you cannot add to a journal while you are trying to clean it up. It's
not just a locking problem :). As Mark pointed out, there's another
problem: we cannot afford to take the page lock while committing because
that's essentially a classical lock inversion between a transaction start
and a page lock. How to solve that one, I don't know yet.
To your question about delayed allocation - currently, we just skip those
buffers, so as you write in another email, you basically get the
guarantees of writeback mode. Actually, I guess the result is the same with
the old way of ordered-mode handling since you cannot afford to do the
block allocation from the commit code there either... We could get around
this limitation (actually more easily than in the old code) if we were
sure we had enough credits in the transaction to really do the allocation
- we would do the allocation in writepage() and attach the changed buffers
to the committing transaction.
> > + if (!wbc->skip_unmapped) {
> > + handle = ext3_journal_start(inode,
> > + ext3_writepage_trans_blocks(inode));
> > + if (IS_ERR(handle)) {
> > + ret = PTR_ERR(handle);
> > + goto out_fail;
> > + }
> > }
> > + else if (!PageMappedToDisk(page))
> > + goto out_fail;
> >
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Fri 07-03-08 16:52:10, Andreas Dilger wrote:
> On Mar 06, 2008 18:42 +0100, Jan Kara wrote:
> > Below is my rewrite of ordered mode in JBD. Now we don't have a list of
> > data buffers that need syncing on transaction commit but a list of inodes
> > that need writeout during commit. This brings all sorts of advantages such
> > as possibility to get rid of journal heads and buffer heads for data
> > buffers in ordered mode, better ordering of writes on transaction commit,
> > simplification of some JBD code, no more anonymous pages when truncate of
> > data being committed happens. The patch has survived some light testing
> > but it still has some potential of eating your data so beware :) I've run
> > dbench to see whether we didn't decrease performance by different handling
> > of truncate and the throughput I'm getting on my machine is the same (OK,
> > is lower by 0.5%) if I disable the code in truncate waiting for commit to
> > finish... Also the throughput of dbench is about 2% better with my patch
> > than with current JBD.
> > Any comments or testing most welcome.
>
> Looks like a very good patch - thanks for your effort in moving this
> beyond the "hand-waving" stage that it's been in for the past few years.
>
> I'm looking at what implications this has for delayed allocation in ext4,
> because the vast majority of file data will be unmapped in that case
> and a journal commit in ordered mode will no longer cause the data to
> be flushed to disk.
>
> I _think_ this is OK, because the pdflushd will now be totally in charge of
> flushing the dirty pages to disk, instead of this previously being done
> by ordered mode in the journal. I know there have been some bugs in this
> area in the past, but I guess it isn't much different than running in
> writeback mode. That said, I don't know how many users run in writeback
> mode unless they are running a database, and the database is doing a lot
> of explicit fsync of file data so there may still be bugs lurking...
Yes, they basically get the same guarantees as in writeback mode. As I
wrote to Mingming, I'm not sure how that case is handled by the current
ordered mode either, because you cannot afford to do block allocation from
the commit code anyway. We could possibly do the allocation if we were sure
that we had enough credits in the transaction (but that currently isn't the
case since we really count the number of buffers dirtied on the account of
the transaction handle and, after the handle is dropped, we give back
unused credits to the transaction).
> Some care is still needed here because with ext4 delayed allocation the
> common case will be unmapped buffers, while the ext3 implementation will
> only have this in very rare cases (unmapped mmap writes), but it looks
> like the right mechanism is already in place for both.
>
> I'll just put a bunch of comments inline, not necessarily problems, just
> walking through the code to ensure I understand what is going on...
>
> > @@ -1675,7 +1675,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
> > */
> > clear_buffer_dirty(bh);
> > set_buffer_uptodate(bh);
> > - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
> > + } else if (!buffer_mapped(bh) && buffer_dirty(bh)
> > + && !wbc->skip_unmapped) {
>
> (comment) This is skipping writeout of unmapped pages during journal commit -
> good, otherwise we would deadlock trying to add block mappings...
>
> > @@ -183,6 +184,8 @@ void ext3_delete_inode (struct inode * inode)
> > {
> > handle_t *handle;
> >
> > + if (ext3_should_order_data(inode))
> > + ext3_begin_ordered_truncate(inode, 0);
>
> (comment) This is flushing out pages allocated in the previous transaction
> to keep consistent semantics in case a crash loses the delete... There
> is some possibility for optimization here - comments below at
> journal_begin_ordered_truncate().
>
> > @@ -1487,46 +1462,49 @@ static int ext3_ordered_writepage(struct page *page,
> > + if (!wbc->skip_unmapped) {
> > + handle = ext3_journal_start(inode,
> > + ext3_writepage_trans_blocks(inode));
> > + if (IS_ERR(handle)) {
> > + ret = PTR_ERR(handle);
> > + goto out_fail;
> > + }
> > }
>
> I guess the common case here for delalloc is that if someone cares to fsync
> the file then that inode would be added to the journal list and the pages
> would be mapped here in the process context and flushed with the journal
> commit. This is ideal because it avoids syncing out all of the dirty pages
> for inodes that were NOT fsync'd and fixes the "one process doing fsync
> kills performance for streaming writes" problem in ordered mode that was
> recently discussed on the list.
>
> We in fact win twice because fsync_page_range() will potentially only
> map and flush the range of pages that the application cares about and
> does not necessarily have to write out all pages (though that will still
> happen in ordered mode if the pages had previously been mapped).
>
> > + else if (!PageMappedToDisk(page))
> > + goto out_fail;
>
> (style) } else if (...) {
> goto out_fail;
> }
>
> > + /* This can go as soon as someone cares about that ;) */
> > if (!page_has_buffers(page)) {
> > create_empty_buffers(page, inode->i_sb->s_blocksize,
> > (1 << BH_Dirty)|(1 << BH_Uptodate));
> > }
>
> Is there any reason to keep this in the non-data-journal case? Adding
> buffer heads to every page is a non-trivial amount of RAM usage.
No. But I think that it's better to do nobh ordered mode in a separate
patch. This one is complicated enough...
> > +static int journal_submit_data_buffers(journal_t *journal,
> > + transaction_t *commit_transaction)
> > {
> > + /*
> > + * We are in a committing transaction. Therefore no new inode
> > + * can be added to our inode list. We use JI_COMMIT_RUNNING
> > + * flag to protect inode we currently operate on from being
> > + * released while we write out pages.
> > + */
>
> Should we J_ASSERT() this is true? Probably better to put this comment
> before the function instead of inside it.
Comment moved. How would you like to assert that the inode isn't being
freed? Maybe we could check jinode->i_transaction == commit_transaction.
That should be good enough.
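I.e. something like this at the top of the loop body in
journal_submit_data_buffers() (untested sketch):

	/* Every inode on t_inode_list must still be attached to us */
	J_ASSERT(jinode->i_transaction == commit_transaction);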
> > + spin_lock(&journal->j_list_lock);
> > + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
> > + mapping = jinode->i_vfs_inode->i_mapping;
> > + if (!mapping_cap_writeback_dirty(mapping))
> > + continue;
> > + wbc.nr_to_write = mapping->nrpages * 2;
> > + jinode->i_flags |= JI_COMMIT_RUNNING;
> > + spin_unlock(&journal->j_list_lock);
> > + err = do_writepages(jinode->i_vfs_inode->i_mapping, &wbc);
> > + if (!ret)
> > + ret = err;
> > + spin_lock(&journal->j_list_lock);
> > + jinode->i_flags &= ~JI_COMMIT_RUNNING;
> > + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
> > }
> > + spin_unlock(&journal->j_list_lock);
>
> Hmm, this is one area I'm a bit worried about. In the old code the number
> of buffers to sync for a transaction are bounded because they can only be
> added to the transaction while it is open. With this new inode-list code
> there could be a process busily adding new dirty + mapped pages to the
> inode(s) in the running transaction, while we are blocked here writing
> out the pages in kjournald trying to commit the previous transaction.
Correct me if I'm wrong, but as far as I can see in the write-out code, we
do just a one-sweep scan of the mapping when writing out dirty pages. So we
could possibly write out more than we need, but at most the whole file. Oh,
OK, someone could be extending the file as well, but we could fix that by
setting .range_end to the current i_size - that's actually a nice
optimization anyway, so I've done that.
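I.e. in journal_submit_data_buffers(), roughly (a sketch; the exact form in
the updated patch may differ):

	wbc.range_start = 0;
	/* don't chase pages a writer keeps appending past the commit */
	wbc.range_end = i_size_read(jinode->i_vfs_inode);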
> At some point we will eventually no longer be able to add new pages to the
> running transaction (because it is full or was closed due to timeout) and
> we won't be able to start a new transaction because the committing transaction
> has not yet committed. At that point we would be able to finish the commit
> of the previous transaction, but not until we had reached the point of
> blocking all operations in the filesystem waiting for the new transaction
> to open. This would essentially lead to livelock of the system, with
> stop-start-stop cycles for the transactions and IO.
>
> What would be needed is some kind of mechanism to separate the dirty pages
> on the inode into "dirty from current transaction" and "dirty from the
> previous transaction". That seems non-trivial, however. I guess we could
> also force any process doing writes to flush out all of the dirty and
> mapped pages on the inode if it detects the inode is in two transactions.
>
> This would at least parallelize the IO submission and also throttle the
> addition of new pages until the dirty ones on the committing transaction
> had been flushed but may essentially mean sync IO performance for streaming
> writes on large files without delalloc (which would not add new pages that
> are mapped normally).
>
> > @@ -655,7 +565,14 @@ start_journal_io:
> > so we incur less scheduling load.
> > */
> >
> > - jbd_debug(3, "JBD: commit phase 4\n");
> > + jbd_debug(3, "JBD: commit phase 3\n");
>
> Rather than numbering these phases, which has little meaning and often
> means they just get renumbered like here, it is probably more useful to
> give them descriptive names like "JBD: commit wait for data buffers", etc.
Yes, but I guess that's for a separate patch. I'll add it to my todo :).
> > +/*
> > + * File inode in the inode list of the handle's transaction
> > + */
> > +int journal_file_inode(handle_t *handle, struct jbd_inode *jinode)
> > +{
> > + transaction_t *transaction = handle->h_transaction;
> > + journal_t *journal = transaction->t_journal;
> > +
> > + if (is_handle_aborted(handle))
> > + return 0;
>
> Should we be returning an error back to the caller at this point?
Yes, of course.
> > + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
> > + transaction->t_tid);
> > +
> > + spin_lock(&journal->j_list_lock);
> > +
> > + if (jinode->i_transaction == transaction ||
> > + jinode->i_next_transaction == transaction)
> > + goto done;
>
> Is it possible/safe to do the above check outside of the j_list_lock
> as an optimization? We will be calling this function for every page
> that is dirtied and it will likely be quite highly contended. It used
> to be that we HAD to get this lock because we were almost certainly
> adding the new buffers to the transaction list, but in this updated
> code the common case is that the inode will already be on the list...
Probably we could do that - there are two places where we remove an inode
from a transaction list (and these are the only ones interesting for
races). One is the commit code and the other one is
journal_release_jbd_inode(). We are safe from the second one because we
obviously hold a reference to the inode. The first one is more subtle, but
if jinode->i_transaction == transaction, the inode is filed on the list of
the running transaction, against which we hold a handle, and so the commit
code cannot touch it. If jinode->i_next_transaction == transaction, then
the commit code can change the inode under us, but it will file the inode
onto the running transaction's list anyway, so there's nothing to lose.
Changed the code and added a big comment...
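The fast path then looks roughly like this (sketch of the reworked
journal_file_inode(); the version in the updated patch may differ in
detail):

	/* The unlocked check is safe - see the race analysis above */
	if (jinode->i_transaction == transaction ||
	    jinode->i_next_transaction == transaction)
		return 0;

	spin_lock(&journal->j_list_lock);
	/* Recheck under the lock before filing the inode */
	if (jinode->i_transaction == transaction ||
	    jinode->i_next_transaction == transaction)
		goto done;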
> > +/*
> > + * This function must be called when inode is journaled in ordered mode
> > + * before truncation happens. It starts writeout of truncated part in
> > + * case it is in the committing transaction so that we stand to ordered
> > + * mode consistency guarantees.
> > + */
> > +int journal_begin_ordered_truncate(struct jbd_inode *inode, loff_t new_size)
> > +{
> > + journal_t *journal;
> > + transaction_t *commit_trans;
> > + int ret = 0;
> > +
> > + if (!inode->i_transaction && !inode->i_next_transaction)
> > + goto out;
> > + journal = inode->i_transaction->t_journal;
> > + spin_lock(&journal->j_state_lock);
> > + commit_trans = journal->j_committing_transaction;
> > + spin_unlock(&journal->j_state_lock);
> > + if (inode->i_transaction == commit_trans) {
> > + ret = __filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
> > + new_size, LLONG_MAX, WB_SYNC_ALL);
> > + if (ret)
> > + journal_abort(journal, ret);
> > + }
>
> Is there any way here to entirely avoid writing blocks which were just
> allocated during the current transaction (e.g. temp files during compile
> or whatever)? One possibility is to store the transaction number when
> the inode was created (or first dirtied?) in ext3_inode_info and if that is
> equal to the committing transaction then we don't need to write anything
> at all?
Note that we write out blocks only if we truncate an inode that is
currently being committed. I.e., it was created in the previous transaction
and is truncated in the current one. Or did you mean something else?
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mar 10, 2008 20:54 +0100, Jan Kara wrote:
> On Fri 07-03-08 16:52:10, Andreas Dilger wrote:
> > I'm looking at what implications this has for delayed allocation in ext4,
> > because the vast majority of file data will be unmapped in that case
> > and a journal commit in ordered mode will no longer cause the data to
> > be flushed to disk.
> >
> > I _think_ this is OK, because the pdflushd will now be totally in charge of
> > flushing the dirty pages to disk, instead of this previously being done
> > by ordered mode in the journal. I know there have been some bugs in this
> > area in the past, but I guess it isn't much different than running in
> > writeback mode. That said, I don't know how many users run in writeback
> > mode unless they are running a database, and the database is doing a lot
> > of explicit fsync of file data so there may still be bugs lurking...
>
> Yes, they basically get the same guarantees as in writeback mode. As I
> wrote to Mingming, I'm not sure how that case is handled by the current
> ordered mode either, because you cannot afford to do block allocation from
> the commit code anyway. We could possibly do the allocation if we were sure
> that we had enough credits in the transaction (but that currently isn't the
> case since we really count the number of buffers dirtied on the account of
> the transaction handle and, after the handle is dropped, we give back
> unused credits to the transaction).
Actually, I think this is the same semantics as the existing ordered mode,
as long as the on-disk inode is only changed as part of the transaction
that is allocating the pages (which is needed for correctness in any case).
The noticeable difference will be that the new ordered mode will not force
out pending dirty pages, if it is run with delalloc. This would be a big
improvement, because users of fsync() on a filesystem with another streaming
IO process have terrible latency problems with the current code. So not
only does your patch reduce complexity, it allows ext4 delalloc to work
in ordered mode, and improves overall responsiveness as well.
I think in ext4 it makes sense to always be running in delalloc mode once
your patch is landed, as delalloc has improved performance greatly, and
one of the few reasons we haven't submitted it upstream was the lack of
ordered mode support.
As far as patching ext3/jbd vs. ext4/jbd2, I think it makes a lot more
sense to test this patch with ext4 first in -mm because people will not
expect as much stability with ext4 vs. ext3 and this will allow some time
to get testing on your changes, which affect a very core part of ext3/jbd.
> > Hmm, this is one area I'm a bit worried about. In the old code the number
> > of buffers to sync for a transaction are bounded because they can only be
> > added to the transaction while it is open. With this new inode-list code
> > there could be a process busily adding new dirty + mapped pages to the
> > inode(s) in the running transaction, while we are blocked here writing
> > out the pages in kjournald trying to commit the previous transaction.
>
> Correct me if I'm wrong, but as far as I can see in the write-out code, we
> do just a one-sweep scan of the mapping when writing out dirty pages. So we
> could possibly write out more than we need, but at most the whole file. Oh,
> OK, someone could be extending the file as well, but we could fix that by
> setting .range_end to the current i_size - that's actually a nice
> optimization anyway, so I've done that.
Yes, the continually-growing file livelock is a problem that hit fsync in
the past, and we need to avoid it here also. Using .range_end is a good
way to avoid it, though it isn't 100% foolproof due to usage like:
ftruncate(fd, LARGE_SIZE);
write(fd, buf, LARGE_SIZE);
It will eventually end, but not for a long time.
> > > + spin_lock(&journal->j_list_lock);
> > > +
> > > + if (jinode->i_transaction == transaction ||
> > > + jinode->i_next_transaction == transaction)
> > > + goto done;
> >
> > Is it possible/safe to do the above check outside of the j_list_lock
> > as an optimization? We will be calling this function for every page
> > that is dirtied and it will likely be quite highly contended. It used
> > to be that we HAD to get this lock because we were almost certainly
> > adding the new buffers to the transaction list, but in this updated
> > code the common case is that the inode will already be on the list...
>
> Probably we could do that - there are two places where we remove an inode
> from a transaction list (and these are the only ones interesting for
> races). One is the commit code and the other is
> journal_release_jbd_inode(). We are safe from the second one because we
> obviously hold a reference to the inode. The first one is more subtle, but
> if jinode->i_transaction == transaction, it is filed on the running
> transaction's list, on which we hold a handle, so the commit code cannot
> touch it. If jinode->i_next_transaction == transaction, then the commit
> code can change the inode under us, but it will file the inode onto the
> running transaction's list anyway, so there's nothing to lose.
> Changed the code and added a big comment...
Great, thanks. Avoiding lock contention is a big issue on many-way SMP
systems, which are becoming much more common these days.
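i.e. the fast path becomes something like this (untested sketch; the real
code carries the big comment explaining why the unlocked test is safe):

	/*
	 * Unlocked check: if the inode is already filed on the running
	 * transaction, or commit will move it there via
	 * i_next_transaction, there is nothing to do.  We hold a handle
	 * on the running transaction, so commit cannot move the inode
	 * off that transaction's list under us.
	 */
	if (jinode->i_transaction == transaction ||
	    jinode->i_next_transaction == transaction)
		return 0;

	spin_lock(&journal->j_list_lock);
	/* Recheck under j_list_lock before filing the inode. */
	if (jinode->i_transaction == transaction ||
	    jinode->i_next_transaction == transaction)
		goto done;
	/* ... file the inode on the running transaction's list ... */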
> > > +int journal_begin_ordered_truncate(struct jbd_inode *inode, loff_t new_size)
> > > +{
> > > + journal_t *journal;
> > > + transaction_t *commit_trans;
> > > + int ret = 0;
> > > +
> > > + if (!inode->i_transaction && !inode->i_next_transaction)
> > > + goto out;
> > > +
> > > + journal = inode->i_transaction->t_journal;
> > > + spin_lock(&journal->j_state_lock);
> > > + commit_trans = journal->j_committing_transaction;
> > > + spin_unlock(&journal->j_state_lock);
> > > + if (inode->i_transaction == commit_trans) {
> > > + ret = __filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
> > > + new_size, LLONG_MAX, WB_SYNC_ALL);
> > > + if (ret)
> > > + journal_abort(journal, ret);
> > > + }
> >
> > Is there any way here to entirely avoid writing blocks which were just
> > allocated during the current transaction (e.g. temp files during compile
> > or whatever)? One possibility is to store the transaction number when
> > the inode was created (or first dirtied?) in ext3_inode_info and if that is
> > equal to the committing transaction then we don't need to write anything
> > at all?
>
> Note that we write out blocks only if we truncate inode that is being
> currently committed. I.e., it has been created in the previous transaction
> and is truncated in the current one. Or did you mean something else?
OK, it seems very few files will span the truncate boundary (for compiles,
likely only num_cpus of them), and if they can be truncated within the same
transaction without IO, that is fine.
Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.
Hi,
While looking at a bug where direct IO returns EIO, I found after
inspecting the code that there is a window in which
try_to_free_buffers(), called from direct IO, can race with JBD, which
holds a reference to the data buffers until journal_commit_transaction()
ensures the data buffers have reached the disk.
In a little more detail: to prepare for direct IO, generic_file_direct_IO()
calls invalidate_inode_pages2_range() to invalidate the pages in the
cache before performing direct IO. invalidate_inode_pages2_range()
tries to free the buffers via try_to_free_buffers(), but sometimes it
can't, because the buffers may still be on some transaction's
t_sync_datalist or t_locked_list waiting for
journal_commit_transaction() to process them.
Currently direct IO simply returns EIO if try_to_free_buffers() finds
a buffer busy, as it has no clue that JBD is referencing it.
Is this a known issue and expected behavior? Any thoughts?
Mingming
On Fri, 25 Apr 2008 16:38:23 -0700 Mingming Cao <[email protected]> wrote:
> Hi,
>
> While looking at a bug related to direct IO returns to EIO, after
> looking at the code, I found there is a window that
> try_to_free_buffers() from direct IO could race with JBD, which holds
> the reference to the data buffers before journal_commit_transaction()
> ensures the data buffers has reached to the disk.
>
> A little more detail: to prepare for direct IO, generic_file_direct_IO()
> calls invalidate_inode_pages2_range() to invalidate the pages in the
> cache before performaning direct IO. invalidate_inode_pages2_range()
> tries to free the buffers via try_to free_buffers(), but sometimes it
> can't, due to the buffers is possible still on some transaction's
> t_sync_datalist or t_locked_list waiting for
> journal_commit_transaction() to process it.
>
> Currently Direct IO simply returns EIO if try_to_free_buffers() finds
> the buffer is busy, as it has no clue that JBD is referencing it.
>
> Is this a known issue and expected behavior? Any thoughts?
Something like that might be possible, although people used to test
buffered-vs-direct fairly heavily.
generic_file_direct_IO() will run
filemap_write_and_wait()->filemap_fdatawrite() under i_mutex, and this
should run commits, write back dirty pages, etc.
There might remain races though, perhaps with page faults.
Hi,
On Fri 25-04-08 16:38:23, Mingming Cao wrote:
> While looking at a bug related to direct IO returns to EIO, after
> looking at the code, I found there is a window that
> try_to_free_buffers() from direct IO could race with JBD, which holds
> the reference to the data buffers before journal_commit_transaction()
> ensures the data buffers has reached to the disk.
>
> A little more detail: to prepare for direct IO, generic_file_direct_IO()
> calls invalidate_inode_pages2_range() to invalidate the pages in the
> cache before performaning direct IO. invalidate_inode_pages2_range()
> tries to free the buffers via try_to free_buffers(), but sometimes it
> can't, due to the buffers is possible still on some transaction's
> t_sync_datalist or t_locked_list waiting for
> journal_commit_transaction() to process it.
>
> Currently Direct IO simply returns EIO if try_to_free_buffers() finds
> the buffer is busy, as it has no clue that JBD is referencing it.
>
> Is this a known issue and expected behavior? Any thoughts?
Are you seeing this in data=ordered mode? As Andrew pointed out, we do
filemap_write_and_wait(), so all the relevant data buffers of the inode
should already be on disk. In __journal_try_to_free_buffer() we check
whether the buffer is an already-written-out data buffer, and we unfile
and free it in that case. It shouldn't happen that a data buffer has
b_next_transaction set, so really the only explanation I have for why
try_to_free_buffers() could fail is that somebody manages to write to a
page via mmap before invalidate_inode_pages2_range() gets to it. Under
which kind of load do you observe the problem? Do you know exactly
which condition makes journal_try_to_free_buffers() fail?
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon, 2008-04-28 at 14:26 +0200, Jan Kara wrote:
> Hi,
>
> On Fri 25-04-08 16:38:23, Mingming Cao wrote:
> > While looking at a bug related to direct IO returns to EIO, after
> > looking at the code, I found there is a window that
> > try_to_free_buffers() from direct IO could race with JBD, which holds
> > the reference to the data buffers before journal_commit_transaction()
> > ensures the data buffers has reached to the disk.
> >
> > A little more detail: to prepare for direct IO, generic_file_direct_IO()
> > calls invalidate_inode_pages2_range() to invalidate the pages in the
> > cache before performaning direct IO. invalidate_inode_pages2_range()
> > tries to free the buffers via try_to free_buffers(), but sometimes it
> > can't, due to the buffers is possible still on some transaction's
> > t_sync_datalist or t_locked_list waiting for
> > journal_commit_transaction() to process it.
> >
> > Currently Direct IO simply returns EIO if try_to_free_buffers() finds
> > the buffer is busy, as it has no clue that JBD is referencing it.
> >
> > Is this a known issue and expected behavior? Any thoughts?
> Are you seeing this in data=ordered mode? As Andrew pointed out we do
> filemap_write_and_wait() so all the relevant data buffers of the inode
> should be already on disk. In __journal_try_to_free_buffer() we check
> whether the buffer is already-written-out data buffer and unfile and free
> it in that case. It shouldn't happen that a data buffer has
> b_next_transaction set so really the only idea why try_to_free_buffers()
> could fail is that somebody manages to write to a page via mmap before
> invalidate_inode_pages2_range() gets to it. Under which kind of load do you
> observe the problem? Do you know exactly because of which condition does
> journal_try_to_free_buffers() fail?
>
Thank you for your reply.
What we are noticing is that invalidate_inode_pages2_range() fails with
-EIO (from try_to_free_buffers(), since b_count > 0).
I don't think the file is being updated through mmap(). A previous
writepage() added these buffers to the t_sync_data list (data=ordered).
filemap_write_and_wait() waits for page writeback to be cleared.
So the buffers are no longer dirty, but they are still on t_sync_data
and kjournald hasn't had a chance to process them yet :(
Since there is an elevated b_count on these buffers, try_to_free_buffers()
fails. How can we make filemap_write_and_wait() wait for kjournald
to unfile these buffers?
Does this make sense? Am I missing something here?
Thanks,
Badari
On Mon 28-04-08 10:11:34, Badari Pulavarty wrote:
>
> On Mon, 2008-04-28 at 14:26 +0200, Jan Kara wrote:
> > Hi,
> >
> > On Fri 25-04-08 16:38:23, Mingming Cao wrote:
> > > While looking at a bug related to direct IO returns to EIO, after
> > > looking at the code, I found there is a window that
> > > try_to_free_buffers() from direct IO could race with JBD, which holds
> > > the reference to the data buffers before journal_commit_transaction()
> > > ensures the data buffers has reached to the disk.
> > >
> > > A little more detail: to prepare for direct IO, generic_file_direct_IO()
> > > calls invalidate_inode_pages2_range() to invalidate the pages in the
> > > cache before performaning direct IO. invalidate_inode_pages2_range()
> > > tries to free the buffers via try_to free_buffers(), but sometimes it
> > > can't, due to the buffers is possible still on some transaction's
> > > t_sync_datalist or t_locked_list waiting for
> > > journal_commit_transaction() to process it.
> > >
> > > Currently Direct IO simply returns EIO if try_to_free_buffers() finds
> > > the buffer is busy, as it has no clue that JBD is referencing it.
> > >
> > > Is this a known issue and expected behavior? Any thoughts?
> > Are you seeing this in data=ordered mode? As Andrew pointed out we do
> > filemap_write_and_wait() so all the relevant data buffers of the inode
> > should be already on disk. In __journal_try_to_free_buffer() we check
> > whether the buffer is already-written-out data buffer and unfile and free
> > it in that case. It shouldn't happen that a data buffer has
> > b_next_transaction set so really the only idea why try_to_free_buffers()
> > could fail is that somebody manages to write to a page via mmap before
> > invalidate_inode_pages2_range() gets to it. Under which kind of load do you
> > observe the problem? Do you know exactly because of which condition does
> > journal_try_to_free_buffers() fail?
> >
>
> Thank you for your reply.
>
> What we are noticing is invalidate_inode_pages2_range() fails with -EIO
> (from try_to_free_buffers() since b_count > 0).
>
> I don't think the file is being updated through mmap(). Previous
> writepage() added these buffers to t_sync_data list (data=ordered).
> filemap_write_and_wait() waits for pagewrite back to be cleared.
> So, buffers are no longer dirty, but still on the t_sync_data and
> kjournald didn't get chance to process them yet :(
>
> Since we have elevated b_count on these buffers, try_to_free_buffers()
> fails. How can we make filemap_write_and_wait() to wait for kjournald
> to unfile these buffers ?
Hmm, I don't get one thing:
The call chain is invalidate_inode_pages2_range() ->
invalidate_complete_page2() -> try_to_release_page() -> ext3_releasepage()
-> journal_try_to_free_buffers() -> __journal_try_to_free_buffer() and this
function should remove the buffer from the committing transaction. So who's
holding the reference to those buffers? Or is it that
__journal_try_to_free_buffer() fails to remove the buffer from the
committing transaction? Why?
Hmm, maybe I have one idea - in theory we could call
__journal_try_to_free_buffer() exactly at the moment the commit code
inspects the buffer. Then we'd release the buffer from the transaction,
but try_to_free_buffers() would fail because of the elevated b_count,
exactly as you described. Could you maybe verify this? Not that I'd
know how to easily fix this ;)...
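As an interleaving, roughly (illustrative only):

	kjournald (commit)                DIO (invalidate path)
	------------------                ---------------------
	get_bh(bh)  /* inspects the
	               data buffer */
	                                  journal_try_to_free_buffers()
	                                    __journal_try_to_free_buffer()
	                                      /* unfiles bh from the
	                                         committing transaction */
	                                    try_to_free_buffers()
	                                      /* fails: b_count is still
	                                         elevated by commit */
	put_bh(bh)  /* too late, the DIO path
	               has already returned -EIO */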
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon, 2008-04-28 at 20:09 +0200, Jan Kara wrote:
> On Mon 28-04-08 10:11:34, Badari Pulavarty wrote:
> >
> > On Mon, 2008-04-28 at 14:26 +0200, Jan Kara wrote:
> > > Hi,
> > >
> > > On Fri 25-04-08 16:38:23, Mingming Cao wrote:
> > > > While looking at a bug related to direct IO returns to EIO, after
> > > > looking at the code, I found there is a window that
> > > > try_to_free_buffers() from direct IO could race with JBD, which holds
> > > > the reference to the data buffers before journal_commit_transaction()
> > > > ensures the data buffers has reached to the disk.
> > > >
> > > > A little more detail: to prepare for direct IO, generic_file_direct_IO()
> > > > calls invalidate_inode_pages2_range() to invalidate the pages in the
> > > > cache before performaning direct IO. invalidate_inode_pages2_range()
> > > > tries to free the buffers via try_to free_buffers(), but sometimes it
> > > > can't, due to the buffers is possible still on some transaction's
> > > > t_sync_datalist or t_locked_list waiting for
> > > > journal_commit_transaction() to process it.
> > > >
> > > > Currently Direct IO simply returns EIO if try_to_free_buffers() finds
> > > > the buffer is busy, as it has no clue that JBD is referencing it.
> > > >
> > > > Is this a known issue and expected behavior? Any thoughts?
> > > Are you seeing this in data=ordered mode? As Andrew pointed out we do
> > > filemap_write_and_wait() so all the relevant data buffers of the inode
> > > should be already on disk. In __journal_try_to_free_buffer() we check
> > > whether the buffer is already-written-out data buffer and unfile and free
> > > it in that case. It shouldn't happen that a data buffer has
> > > b_next_transaction set so really the only idea why try_to_free_buffers()
> > > could fail is that somebody manages to write to a page via mmap before
> > > invalidate_inode_pages2_range() gets to it. Under which kind of load do you
> > > observe the problem? Do you know exactly because of which condition does
> > > journal_try_to_free_buffers() fail?
> > >
> >
> > Thank you for your reply.
> >
> > What we are noticing is invalidate_inode_pages2_range() fails with -EIO
> > (from try_to_free_buffers() since b_count > 0).
> >
> > I don't think the file is being updated through mmap(). Previous
> > writepage() added these buffers to t_sync_data list (data=ordered).
> > filemap_write_and_wait() waits for pagewrite back to be cleared.
> > So, buffers are no longer dirty, but still on the t_sync_data and
> > kjournald didn't get chance to process them yet :(
> >
> > Since we have elevated b_count on these buffers, try_to_free_buffers()
> > fails. How can we make filemap_write_and_wait() to wait for kjournald
> > to unfile these buffers ?
> Hmm, I don't get one thing:
> The call chain is invalidate_inode_pages2_range() ->
> invalidate_complete_page2() -> try_to_release_page() -> ext3_releasepage()
> -> journal_try_to_free_buffers() -> __journal_try_to_free_buffer() and this
> function should remove the buffer from the committing transaction.
Thanks, yes, I noticed that after you pointed it out.
But __journal_try_to_free_buffer() only unfiles the buffer from
t_sync_datalist or t_locked_list; the journal head is not removed by
journal_remove_journal_head() there, because it only removes the jh if
the b_jcount counter is 0. Since journal_try_to_free_buffers() has
already increased b_jcount in journal_grab_journal_head() before
calling __journal_try_to_free_buffer(), the journal head is not removed
in __journal_try_to_free_buffer() -> journal_remove_journal_head().
> So who's
> holding the reference to those buffers?
Looking at the code, it seems it's journal_put_journal_head(jh)
that removes the journal head and drops the bh reference:
journal_try_to_free_buffers()
{
...
jh = journal_grab_journal_head(bh);
if (!jh)
continue;
jbd_lock_bh_state(bh);
__journal_try_to_free_buffer(journal, bh);
journal_put_journal_head(jh);
jbd_unlock_bh_state(bh);
...
}
So when journal_put_journal_head() -> __journal_remove_journal_head()
runs, b_jcount is zero, but is jh->b_transaction NULL? It seems
possible that the bh refcount is still nonzero on exit from
journal_put_journal_head() if jh->b_transaction has not been cleared.
Did I miss where jh->b_transaction is cleared to NULL?
void journal_put_journal_head(struct journal_head *jh)
{
struct buffer_head *bh = jh2bh(jh);
jbd_lock_bh_journal_head(bh);
J_ASSERT_JH(jh, jh->b_jcount > 0);
--jh->b_jcount;
if (!jh->b_jcount && !jh->b_transaction) {
__journal_remove_journal_head(bh);
__brelse(bh);
}
jbd_unlock_bh_journal_head(bh);
}
Mingming
> Or is it that
> __journal_try_to_free_buffer() fails to remove the buffer from the
> committing transaction? Why?
> Hmm, maybe I have one idea - in theory we could call
> __journal_try_to_free_buffer() exactly at the moment commit code inspects
> the buffer. Then we'd release the buffer from the transaction but
> try_to_free_buffers() would fail because of elevated b_count exactly as you
> described. Could you maybe verify this? Not that I'd know how to easily fix
> this ;)...
>
> Honza
On Mon 28-04-08 12:09:23, Mingming Cao wrote:
> On Mon, 2008-04-28 at 20:09 +0200, Jan Kara wrote:
> > On Mon 28-04-08 10:11:34, Badari Pulavarty wrote:
> > >
> > > On Mon, 2008-04-28 at 14:26 +0200, Jan Kara wrote:
> > > > Hi,
> > > >
> > > > On Fri 25-04-08 16:38:23, Mingming Cao wrote:
> > > > > While looking at a bug related to direct IO returns to EIO, after
> > > > > looking at the code, I found there is a window that
> > > > > try_to_free_buffers() from direct IO could race with JBD, which holds
> > > > > the reference to the data buffers before journal_commit_transaction()
> > > > > ensures the data buffers has reached to the disk.
> > > > >
> > > > > A little more detail: to prepare for direct IO, generic_file_direct_IO()
> > > > > calls invalidate_inode_pages2_range() to invalidate the pages in the
> > > > > cache before performaning direct IO. invalidate_inode_pages2_range()
> > > > > tries to free the buffers via try_to free_buffers(), but sometimes it
> > > > > can't, due to the buffers is possible still on some transaction's
> > > > > t_sync_datalist or t_locked_list waiting for
> > > > > journal_commit_transaction() to process it.
> > > > >
> > > > > Currently Direct IO simply returns EIO if try_to_free_buffers() finds
> > > > > the buffer is busy, as it has no clue that JBD is referencing it.
> > > > >
> > > > > Is this a known issue and expected behavior? Any thoughts?
> > > > Are you seeing this in data=ordered mode? As Andrew pointed out we do
> > > > filemap_write_and_wait() so all the relevant data buffers of the inode
> > > > should be already on disk. In __journal_try_to_free_buffer() we check
> > > > whether the buffer is already-written-out data buffer and unfile and free
> > > > it in that case. It shouldn't happen that a data buffer has
> > > > b_next_transaction set so really the only idea why try_to_free_buffers()
> > > > could fail is that somebody manages to write to a page via mmap before
> > > > invalidate_inode_pages2_range() gets to it. Under which kind of load do you
> > > > observe the problem? Do you know exactly because of which condition does
> > > > journal_try_to_free_buffers() fail?
> > > >
> > >
> > > Thank you for your reply.
> > >
> > > What we are noticing is invalidate_inode_pages2_range() fails with -EIO
> > > (from try_to_free_buffers() since b_count > 0).
> > >
> > > I don't think the file is being updated through mmap(). Previous
> > > writepage() added these buffers to t_sync_data list (data=ordered).
> > > filemap_write_and_wait() waits for pagewrite back to be cleared.
> > > So, buffers are no longer dirty, but still on the t_sync_data and
> > > kjournald didn't get chance to process them yet :(
> > >
> > > Since we have elevated b_count on these buffers, try_to_free_buffers()
> > > fails. How can we make filemap_write_and_wait() to wait for kjournald
> > > to unfile these buffers ?
> > Hmm, I don't get one thing:
> > The call chain is invalidate_inode_pages2_range() ->
> > invalidate_complete_page2() -> try_to_release_page() -> ext3_releasepage()
> > -> journal_try_to_free_buffers() -> __journal_try_to_free_buffer() and this
> > function should remove the buffer from the committing transaction.
>
> Thanks, yes I noticed that after you pointing this out.
>
> But __journal_try_to_free_buffer() only unfile the buffer from
> t_sync_datalist or t_locked list, the journal head is not removed in
> journal_remove_journal_head() there, at that time,
> journal_remove_journal_head() just check if counter b_jcount is 0. But
> before calling __journal_try_to_free_buffer(), since
> journal_try_to_free_buffers() already increase the b_jcount in
> journal_grab_journal_head(), so the journal head is not removed in
> __journal_try_to_free_buffer-> journal_remove_journal_head()
>
> > So who's
> > holding the reference to those buffers?
>
> Looking at the code, it seems the it's the journal_put_journal_head(jh)
> who remove the journal head and decrease the bh
>
> journal_try_to_free_buffers()
> {
> ...
>
> jh = journal_grab_journal_head(bh);
> if (!jh)
> continue;
>
> jbd_lock_bh_state(bh);
> __journal_try_to_free_buffer(journal, bh);
> journal_put_journal_head(jh);
> jbd_unlock_bh_state(bh);
>
> ...
>
> }
> so when journal_put_journal_head()-> __journal_remove_journal_head(),
> now the b_jcount is zero, but is
> jh->b_transaction is NULL? So it seems possible that bh ref count is non
> zero when exit from journal_put_journal_head() if jh_b_transaction is
> not cleared.
>
> I miss where jh->b_transaction is clear to NULL?
__journal_unfile_buffer(), called from __journal_try_to_free_buffer(),
sets jh->b_transaction to NULL. So as soon as journal_put_journal_head()
is called, it frees the journal head and releases the buffer reference.
So really the only possible race I see is what I describe below...
> > Hmm, maybe I have one idea - in theory we could call
> > __journal_try_to_free_buffer() exactly at the moment commit code inspects
> > the buffer. Then we'd release the buffer from the transaction but
> > try_to_free_buffers() would fail because of elevated b_count exactly as you
> > described. Could you maybe verify this? Not that I'd know how to easily fix
> > this ;)...
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Tue, 2008-04-29 at 14:43 +0200, Jan Kara wrote:
> On Mon 28-04-08 12:09:23, Mingming Cao wrote:
> > On Mon, 2008-04-28 at 20:09 +0200, Jan Kara wrote:
> > > On Mon 28-04-08 10:11:34, Badari Pulavarty wrote:
> > > >
> > > > On Mon, 2008-04-28 at 14:26 +0200, Jan Kara wrote:
> > > > > Hi,
> > > > >
> > > > > On Fri 25-04-08 16:38:23, Mingming Cao wrote:
> > > > > > While looking at a bug related to direct IO returns to EIO, after
> > > > > > looking at the code, I found there is a window that
> > > > > > try_to_free_buffers() from direct IO could race with JBD, which holds
> > > > > > the reference to the data buffers before journal_commit_transaction()
> > > > > > ensures the data buffers has reached to the disk.
> > > > > >
> > > > > > A little more detail: to prepare for direct IO, generic_file_direct_IO()
> > > > > > calls invalidate_inode_pages2_range() to invalidate the pages in the
> > > > > > cache before performaning direct IO. invalidate_inode_pages2_range()
> > > > > > tries to free the buffers via try_to free_buffers(), but sometimes it
> > > > > > can't, due to the buffers is possible still on some transaction's
> > > > > > t_sync_datalist or t_locked_list waiting for
> > > > > > journal_commit_transaction() to process it.
> > > > > >
> > > > > > Currently Direct IO simply returns EIO if try_to_free_buffers() finds
> > > > > > the buffer is busy, as it has no clue that JBD is referencing it.
> > > > > >
> > > > > > Is this a known issue and expected behavior? Any thoughts?
> > > > > Are you seeing this in data=ordered mode? As Andrew pointed out we do
> > > > > filemap_write_and_wait() so all the relevant data buffers of the inode
> > > > > should be already on disk. In __journal_try_to_free_buffer() we check
> > > > > whether the buffer is already-written-out data buffer and unfile and free
> > > > > it in that case. It shouldn't happen that a data buffer has
> > > > > b_next_transaction set so really the only idea why try_to_free_buffers()
> > > > > could fail is that somebody manages to write to a page via mmap before
> > > > > invalidate_inode_pages2_range() gets to it. Under which kind of load do you
> > > > > observe the problem? Do you know exactly because of which condition does
> > > > > journal_try_to_free_buffers() fail?
> > > > >
> > > >
> > > > Thank you for your reply.
> > > >
> > > > What we are noticing is invalidate_inode_pages2_range() fails with -EIO
> > > > (from try_to_free_buffers() since b_count > 0).
> > > >
> > > > I don't think the file is being updated through mmap(). Previous
> > > > writepage() added these buffers to t_sync_data list (data=ordered).
> > > > filemap_write_and_wait() waits for pagewrite back to be cleared.
> > > > So, buffers are no longer dirty, but still on the t_sync_data and
> > > > kjournald didn't get chance to process them yet :(
> > > >
> > > > Since we have elevated b_count on these buffers, try_to_free_buffers()
> > > > fails. How can we make filemap_write_and_wait() to wait for kjournald
> > > > to unfile these buffers ?
> > > Hmm, I don't get one thing:
> > > The call chain is invalidate_inode_pages2_range() ->
> > > invalidate_complete_page2() -> try_to_release_page() -> ext3_releasepage()
> > > -> journal_try_to_free_buffers() -> __journal_try_to_free_buffer() and this
> > > function should remove the buffer from the committing transaction.
> >
> > Thanks, yes I noticed that after you pointing this out.
> >
> > But __journal_try_to_free_buffer() only unfile the buffer from
> > t_sync_datalist or t_locked list, the journal head is not removed in
> > journal_remove_journal_head() there, at that time,
> > journal_remove_journal_head() just check if counter b_jcount is 0. But
> > before calling __journal_try_to_free_buffer(), since
> > journal_try_to_free_buffers() already increase the b_jcount in
> > journal_grab_journal_head(), so the journal head is not removed in
> > __journal_try_to_free_buffer-> journal_remove_journal_head()
> >
> > > So who's
> > > holding the reference to those buffers?
> >
> > Looking at the code, it seems the it's the journal_put_journal_head(jh)
> > who remove the journal head and decrease the bh
> >
> > journal_try_to_free_buffers()
> > {
> > ...
> >
> > jh = journal_grab_journal_head(bh);
> > if (!jh)
> > continue;
> >
> > jbd_lock_bh_state(bh);
> > __journal_try_to_free_buffer(journal, bh);
> > journal_put_journal_head(jh);
> > jbd_unlock_bh_state(bh);
> >
> > ...
> >
> > }
> > so when journal_put_journal_head()-> __journal_remove_journal_head(),
> > now the b_jcount is zero, but is
> > jh->b_transaction is NULL? So it seems possible that bh ref count is non
> > zero when exit from journal_put_journal_head() if jh_b_transaction is
> > not cleared.
> >
> > I miss where jh->b_transaction is clear to NULL?
> __journal_unfile_buffer() called from __journal_try_to_free_buffer() sets
> jh->b_transaction to NULL. So as soon as journal_put_journal_head() is
> called, it results in freeing of journal head and releasing buffer
> reference.
Thanks, I saw this piece of code after I posted it.
> So really the only possible race I see is what I describe
> below...
>
> > > Hmm, maybe I have one idea - in theory we could call
> > > __journal_try_to_free_buffer() exactly at the moment commit code inspects
> > > the buffer. Then we'd release the buffer from the transaction but
> > > try_to_free_buffers() would fail because of elevated b_count exactly as you
> > > described. Could you maybe verify this? Not that I'd know how to easily fix
> > > this ;)...
>
Here are some details:
The customer workload involves direct IO and buffered IO. They saw EIO
returned without any log messages. Initial probing via SystemTap
shows:
drop_buffers returns 0
try_to_free_buffers returns 0
try_to_release_page returns 0
drop_buffers returns 0
try_to_free_buffers returns 0
try_to_release_page returns 0
drop_buffers returns 0
try_to_free_buffers returns 0
try_to_release_page returns 0
invalidate_inode_pages2_range returns -5 (EIO)
drop_buffers returns 0
try_to_free_buffers returns 0
try_to_release_page returns 0
This indicates that the EIO comes from
invalidate_inode_pages2_range(), which tried to free buffers but
failed.
We will try to add more debug information. Thanks for the suggestions.
However, since ext3 has a releasepage method defined, the
try_to_free_buffers() failure should come from
try_to_release_page() -> ext3_releasepage() -> journal_try_to_free_buffers() -> try_to_free_buffers(), instead of try_to_release_page() calling try_to_free_buffers() directly.
If journal_try_to_free_buffers() calls try_to_free_buffers(), that means
the journal head has already been successfully removed by
journal_remove_journal_head(), so the buffer_jbd() safety check after it
is false, as expected. Otherwise try_to_free_buffers() won't be called.
In that case, I am not sure it is possible to race with the commit
code: we seem to hold j_list_lock while
__journal_try_to_free_buffer() is taking the buffer off the
list.
There are many other try_to_release_page() failures before the DIO EIO;
I'm not sure where those come from.
Fortunately, Badari is able to reproduce this problem via simple
buffered writes and direct writes to the same file on 2.6.25-git12. We
could add more debug info there to see if we can get the counter and
jh values out when trying to free a busy buffer.
> Honza
Hi Andrew & Jan,
I was able to reproduce the customer problem involving DIO
(invalidate_inode_pages2) by writing a simple testcase that keeps
writing to a file using buffered writes and DIO writes forever in a
loop. I see the DIO writes fail with -EIO.
After a long debugging session, I found two ways this can happen. Both
are race conditions between journal_try_to_free_buffers() and
journal_commit_transaction().
1) journal_submit_data_buffers() tries to get the bh_state lock. If the
trylock fails, it drops j_list_lock and sleeps for the bh_state lock,
while holding a reference on the buffer.
In the meanwhile, journal_try_to_free_buffers() can clean up the
journal head and call try_to_free_buffers(). try_to_free_buffers()
then fails due to the reference held by journal_submit_data_buffers()
- which in turn causes failures for DIO (invalidate_inode_pages2()).
2) When the buffer is on t_locked_list waiting for IO to finish,
we hold a reference and give up the cpu if we can't get the
bh_state lock. This also causes try_to_free_buffers() to fail.
The fix is to drop the reference on the buffer if we can't get the
bh_state lock, give up the cpu and retry the whole operation -
instead of waiting for the bh_state lock.
Does this look like a reasonable fix?
Thanks,
Badari
1) journal_submit_data_buffers() tries to get the bh_state lock. If the
trylock fails, it drops j_list_lock and sleeps for the bh_state lock,
while holding a reference on the buffer head.
In the meanwhile, journal_try_to_free_buffers() can clean up the
journal head and call try_to_free_buffers(). try_to_free_buffers()
then fails due to the reference held by journal_submit_data_buffers()
- which in turn causes failures for DIO (invalidate_inode_pages2()).
2) When the buffer is on t_locked_list waiting for IO to finish,
we hold a reference and give up the cpu if we can't get the
bh_state lock. This causes try_to_free_buffers() to fail.
The fix is to drop the reference on the buffer, give up the cpu
and retry the whole operation.
Signed-off-by: Badari Pulavarty <[email protected]>
Reviewed-by: Mingming Cao <[email protected]>
---
fs/jbd/commit.c | 20 +++++++++++++-------
fs/jbd2/commit.c | 20 +++++++++++++-------
2 files changed, 26 insertions(+), 14 deletions(-)
Index: linux-2.6.25/fs/jbd/commit.c
===================================================================
--- linux-2.6.25.orig/fs/jbd/commit.c 2008-04-30 08:47:14.000000000 -0700
+++ linux-2.6.25/fs/jbd/commit.c 2008-05-01 07:56:20.000000000 -0700
@@ -79,12 +79,16 @@ nope:
/*
* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
+ * held. For ranking reasons we must trylock. If we lose, unlock the buffer
+ * if needed, drop the reference on the buffer, schedule away and
* return 0. j_list_lock is dropped in this case.
*/
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
+static int inverted_lock(journal_t *journal, struct buffer_head *bh, int locked)
{
if (!jbd_trylock_bh_state(bh)) {
+ if (locked)
+ unlock_buffer(bh);
+ put_bh(bh);
spin_unlock(&journal->j_list_lock);
schedule();
return 0;
@@ -218,10 +222,13 @@ write_out_data:
}
locked = 1;
}
- /* We have to get bh_state lock. Again out of order, sigh. */
- if (!inverted_lock(journal, bh)) {
- jbd_lock_bh_state(bh);
+ /*
+ * We have to get bh_state lock. If the try lock fails, give up
+ * cpu and retry the whole operation.
+ */
+ if (!inverted_lock(journal, bh, locked)) {
spin_lock(&journal->j_list_lock);
+ continue;
}
/* Someone already cleaned up the buffer? */
if (!buffer_jbd(bh)
@@ -430,8 +437,7 @@ void journal_commit_transaction(journal_
err = -EIO;
spin_lock(&journal->j_list_lock);
}
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
+ if (!inverted_lock(journal, bh, 0)) {
spin_lock(&journal->j_list_lock);
continue;
}
Index: linux-2.6.25/fs/jbd2/commit.c
===================================================================
--- linux-2.6.25.orig/fs/jbd2/commit.c 2008-04-30 08:47:14.000000000 -0700
+++ linux-2.6.25/fs/jbd2/commit.c 2008-05-01 07:56:26.000000000 -0700
@@ -81,12 +81,16 @@ nope:
/*
* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
+ * held. For ranking reasons we must trylock. If we lose, unlock the buffer
+ * if needed, drop the reference on the buffer, schedule away and
* return 0. j_list_lock is dropped in this case.
*/
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
+static int inverted_lock(journal_t *journal, struct buffer_head *bh, int locked)
{
if (!jbd_trylock_bh_state(bh)) {
+ if (locked)
+ unlock_buffer(bh);
+ put_bh(bh);
spin_unlock(&journal->j_list_lock);
schedule();
return 0;
@@ -217,8 +221,7 @@ static int journal_wait_on_locked_list(j
ret = -EIO;
spin_lock(&journal->j_list_lock);
}
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
+ if (!inverted_lock(journal, bh, 0)) {
spin_lock(&journal->j_list_lock);
continue;
}
@@ -296,10 +299,13 @@ write_out_data:
}
locked = 1;
}
- /* We have to get bh_state lock. Again out of order, sigh. */
- if (!inverted_lock(journal, bh)) {
- jbd_lock_bh_state(bh);
+ /*
+ * We have to get bh_state lock. If the try lock fails, give up
+ * cpu and retry the whole operation.
+ */
+ if (!inverted_lock(journal, bh, locked)) {
spin_lock(&journal->j_list_lock);
+ continue;
}
/* Someone already cleaned up the buffer? */
if (!buffer_jbd(bh)
On Thu, 2008-05-01 at 08:16 -0700, Badari Pulavarty wrote:
> Hi Andrew & Jan,
>
> I was able to reproduce the customer problem involving DIO
> (invalidate_inode_pages2) problem by writing simple testcase
> to keep writing to a file using buffered writes and DIO writes
> forever in a loop. I see DIO writes fail with -EIO.
>
> After a long debug, found 2 cases how this could happen.
> These are race conditions with journal_try_to_free_buffers()
> and journal_commit_transaction().
>
> 1) journal_submit_data_buffers() tries to get bh_state lock. If
> try lock fails, it drops the j_list_lock and sleeps for
> bh_state lock, while holding a reference on the buffer.
> In the meanwhile, journal_try_to_free_buffers() can clean up the
> journal head could call try_to_free_buffers(). try_to_free_buffers()
> would fail due to the reference held by journal_submit_data_buffers()
> - which in turn causes failues for DIO (invalidate_inode_pages2()).
>
> 2) When the buffer is on t_locked_list waiting for IO to finish,
> we hold a reference and give up the cpu, if we can't get
> bh_state lock. This causes try_to_free_buffers() to fail.
>
Besides these two, I think there are two more race conditions between
journal_try_to_free_buffers() and
journal_commit_transaction()->journal_submit_data_buffers():
3) When journal_submit_data_buffers() sees that a buffer (bh1) is dirty
but fails to lock it, it releases j_list_lock and submits the other
buffers collected in the previous pass, with the reference to bh1 still
held. During this time journal_try_to_free_buffers() can clean up the
journal head of bh1 and remove it from the t_syncdata_list. Then
try_to_free_buffers() fails because of the reference held by
journal_submit_data_buffers():
...
if (buffer_dirty(bh)) {
if (test_set_buffer_locked(bh)) {
BUFFER_TRACE(bh, "needs blocking lock");
spin_unlock(&journal->j_list_lock);
<-- here we release j_list_lock without put_bh(bh);
journal_try_to_free_buffers() can come in and remove this bh from t_syncdata_list
/* Write out all data to prevent deadlocks */
journal_do_submit_data(wbuf, bufs);
bufs = 0;
lock_buffer(bh);
spin_lock(&journal->j_list_lock);
<-- here we continue the scan without checking whether the bh is still on t_syncdata_list
}
locked = 1;
}
4) When journal_commit_transaction() goes through the t_locked_list and
waits for a buffer to be unlocked, it still holds a reference to the
buffer and has released j_list_lock, which gives
journal_try_to_free_buffers() a chance to come in and remove this buffer
from t_locked_list; journal_commit_transaction() then continues as if
the buffer were still on the locked list.
while (commit_transaction->t_locked_list) {
struct buffer_head *bh;
jh = commit_transaction->t_locked_list->b_tprev;
bh = jh2bh(jh);
get_bh(bh);
if (buffer_locked(bh)) {
spin_unlock(&journal->j_list_lock);
wait_on_buffer(bh);
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
spin_lock(&journal->j_list_lock);
}
Mingming
On Thu 01-05-08 08:16:21, Badari Pulavarty wrote:
> Hi Andrew & Jan,
>
> I was able to reproduce the customer problem involving DIO
> (invalidate_inode_pages2) problem by writing simple testcase
> to keep writing to a file using buffered writes and DIO writes
> forever in a loop. I see DIO writes fail with -EIO.
>
> After a long debug, found 2 cases how this could happen.
> These are race conditions with journal_try_to_free_buffers()
> and journal_commit_transaction().
>
> 1) journal_submit_data_buffers() tries to get bh_state lock. If
> try lock fails, it drops the j_list_lock and sleeps for
> bh_state lock, while holding a reference on the buffer.
> In the meanwhile, journal_try_to_free_buffers() can clean up the
> journal head could call try_to_free_buffers(). try_to_free_buffers()
> would fail due to the reference held by journal_submit_data_buffers()
> - which in turn causes failues for DIO (invalidate_inode_pages2()).
>
> 2) When the buffer is on t_locked_list waiting for IO to finish,
> we hold a reference and give up the cpu, if we can't get
> bh_state lock. This causes try_to_free_buffers() to fail.
>
> Fix is to drop the reference on the buffer if we can't get
> bh_state lock, give up the cpu and re-try the whole operation -
> instead of waiting for the vh_state lock.
>
> Does this look like a resonable fix ?
As Mingming pointed out, there are a few other places where we can hold
a bh reference. Note also that we accumulate references to buffers in the
wbuf[] list and we need that for submit_bh(), which consumes one bh
reference. Generally, "nobody can hold a bh reference when not holding
the page lock" - which is basically what it comes down to if you really
want to be sure that journal_try_to_free_buffers() succeeds - seems to
me too fragile and impractical a rule. And also note that in principle
there are other places which hold references to buffers without holding
the page lock - for example writepage() in ordered mode (although that
one is in practice hardly triggerable). So one way we could fix at least
the races with the commit code is to implement a launder_page() callback
for ext3/4 which would wait for the previous transaction commit in case
the page has buffers that are part of that commit (I don't want this
logic in journal_try_to_free_buffers() as that is called also on the
memory-reclaim path, but journal_launder_page() is fine with me). This
would be correct but could considerably slow down O_DIRECT writes in
cases where they're mixed with buffered writes, so I'm not sure if this
is acceptable.
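Something like the following completely untested sketch is what I have
in mind (a smarter version would first check that the page actually has
buffers on the committing transaction's lists):

	static int ext3_launder_page(struct page *page)
	{
		journal_t *journal = EXT3_JOURNAL(page->mapping->host);
		tid_t tid = 0;
		int wait = 0;

		spin_lock(&journal->j_state_lock);
		if (journal->j_committing_transaction) {
			tid = journal->j_committing_transaction->t_tid;
			wait = 1;
		}
		spin_unlock(&journal->j_state_lock);

		/*
		 * A running commit may hold references to our data
		 * buffers; wait for it so that the subsequent
		 * try_to_free_buffers() has a chance to succeed.
		 */
		if (wait)
			log_wait_commit(journal, tid);
		return 0;
	}

This would be wired up as the .launder_page method of ext3_ordered_aops.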
OTOH, with the ordered-mode rewrite patch the problem with the commit
code goes away entirely, since there we don't need extra references to
data buffers (we just use filemap_fdatawrite).
> 1) journal_submit_data_buffers() tries to get bh_state lock. If
> try lock fails, it drops the j_list_lock and sleeps for
> bh_state lock, while holding a reference on the buffer head.
> In the meanwhile, journal_try_to_free_buffers() can clean up the
> journal head could call try_to_free_buffers(). try_to_free_buffers()
> would fail due to the reference held by journal_submit_data_buffers()
> - which inturn causes failues for DIO (invalidate_inode_pages2()).
>
> 2) When the buffer is on t_locked_list waiting for IO to finish,
> we hold a reference and give up the cpu, if we can't get
> bh_state lock. This causes try_to_free_buffers() to fail.
>
> Fix is to drop the reference on the buffer, give up the cpu
> and re-try the whole operation.
>
> Signed-off-by: Badari Pulavarty <[email protected]>
> Reviewed-by: Mingming Cao <[email protected]>
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon, 2008-05-05 at 19:06 +0200, Jan Kara wrote:
> On Thu 01-05-08 08:16:21, Badari Pulavarty wrote:
> > Hi Andrew & Jan,
> >
> > I was able to reproduce the customer problem involving DIO
> > (invalidate_inode_pages2) problem by writing simple testcase
> > to keep writing to a file using buffered writes and DIO writes
> > forever in a loop. I see DIO writes fail with -EIO.
> >
> > After a long debug, found 2 cases how this could happen.
> > These are race conditions with journal_try_to_free_buffers()
> > and journal_commit_transaction().
> >
> > 1) journal_submit_data_buffers() tries to get bh_state lock. If
> > try lock fails, it drops the j_list_lock and sleeps for
> > bh_state lock, while holding a reference on the buffer.
> > In the meanwhile, journal_try_to_free_buffers() can clean up the
> > journal head could call try_to_free_buffers(). try_to_free_buffers()
> > would fail due to the reference held by journal_submit_data_buffers()
> > - which in turn causes failues for DIO (invalidate_inode_pages2()).
> >
> > 2) When the buffer is on t_locked_list waiting for IO to finish,
> > we hold a reference and give up the cpu, if we can't get
> > bh_state lock. This causes try_to_free_buffers() to fail.
> >
> > Fix is to drop the reference on the buffer if we can't get
> > bh_state lock, give up the cpu and re-try the whole operation -
> > instead of waiting for the vh_state lock.
> >
> > Does this look like a resonable fix ?
> As Mingming pointed out there are few other places where we could hold
> the bh reference.
Actually, there is one more place where journal_try_to_free_buffers()
(called from the DIO path) can race with journal_submit_data_buffers():
the DIO plus buffered IO test still returns EIO with the fix that
should have fixed the first three race cases.
I could not figure out how this can happen with Badari's fix to
inverted_lock().
This time the debugging shows that the put_bh() issued after
journal_try_to_free_buffers() failed comes from this code path:
journal_submit_data_buffers() {
...
} else if (!locked && buffer_locked(bh)) {
__journal_file_buffer(jh, commit_transaction,
BJ_Locked);
jbd_unlock_bh_state(bh);
put_bh(bh);
}
But when we get here, we should already have made sure the bh is on
the t_syncdata_list, thanks to Badari's fix...
> Note also that we accumulate references to buffers in the
> wbuf[] list and we need that for submit_bh() which consumes one bh
> reference.
It seems to me that it's safe in this case. When
journal_try_to_free_buffers() is called from the DIO path,
filemap_write_and_wait() has already made sure the IO submitted by
kjournald is finished, by waiting for the buffers to become unlocked.
> Generally, it seems to me as a too fragile and impractical
> rule "nobody can hold bh reference when not holding page lock" which is
> basically what it comes down to if you really want to be sure that
> journal_try_to_free_buffers() succeeds. And also note that in principle
> there are other places which hold references to buffers without holding the
> page lock - for example writepage() in ordered mode (although this one is
> in practice hardly triggerable). So how we could fix at least the races
> with commit code is to implement launder_page() callback for ext3/4 which
> would wait for the previous transaction commit in case the page has buffers
> that are part of that commit (I don't want this logic in
> journal_try_to_free_buffers() as that is called also on memory-reclaim
> path, but journal_launder_page() is fine with me). This would be correct
> but could considerably slow down O_DIRECT writes in cases they're mixed
> with buffered writes so I'm not sure if this is acceptable.
> OTOH with the ordered mode rewrite patch, the problem with commit code
> also goes away since there we don't need extra references to data buffers
> (we use just filemap_fdatawrite).
>
> > 1) journal_submit_data_buffers() tries to get bh_state lock. If
> > try lock fails, it drops the j_list_lock and sleeps for
> > bh_state lock, while holding a reference on the buffer head.
> > In the meanwhile, journal_try_to_free_buffers() can clean up the
> > journal head could call try_to_free_buffers(). try_to_free_buffers()
> > would fail due to the reference held by journal_submit_data_buffers()
> > - which inturn causes failues for DIO (invalidate_inode_pages2()).
> >
> > 2) When the buffer is on t_locked_list waiting for IO to finish,
> > we hold a reference and give up the cpu, if we can't get
> > bh_state lock. This causes try_to_free_buffers() to fail.
> >
> > Fix is to drop the reference on the buffer, give up the cpu
> > and re-try the whole operation.
> >
> > Signed-off-by: Badari Pulavarty <[email protected]>
> > Reviewed-by: Mingming Cao <[email protected]>
>
> Honza
On Mon, 2008-05-05 at 19:06 +0200, Jan Kara wrote:
> On Thu 01-05-08 08:16:21, Badari Pulavarty wrote:
> > Hi Andrew & Jan,
> >
> > I was able to reproduce the customer problem involving DIO
> > (invalidate_inode_pages2) problem by writing simple testcase
> > to keep writing to a file using buffered writes and DIO writes
> > forever in a loop. I see DIO writes fail with -EIO.
> >
> > After a long debug, found 2 cases how this could happen.
> > These are race conditions with journal_try_to_free_buffers()
> > and journal_commit_transaction().
> >
> > 1) journal_submit_data_buffers() tries to get bh_state lock. If
> > try lock fails, it drops the j_list_lock and sleeps for
> > bh_state lock, while holding a reference on the buffer.
> > In the meanwhile, journal_try_to_free_buffers() can clean up the
> > journal head could call try_to_free_buffers(). try_to_free_buffers()
> > would fail due to the reference held by journal_submit_data_buffers()
> > - which in turn causes failues for DIO (invalidate_inode_pages2()).
> >
> > 2) When the buffer is on t_locked_list waiting for IO to finish,
> > we hold a reference and give up the cpu, if we can't get
> > bh_state lock. This causes try_to_free_buffers() to fail.
> >
> > Fix is to drop the reference on the buffer if we can't get
> > bh_state lock, give up the cpu and re-try the whole operation -
> > instead of waiting for the vh_state lock.
> >
> > Does this look like a resonable fix ?
> As Mingming pointed out there are few other places where we could hold
> the bh reference. Note also that we accumulate references to buffers in the
> wbuf[] list and we need that for submit_bh() which consumes one bh
> reference. Generally, it seems to me as a too fragile and impractical
> rule "nobody can hold bh reference when not holding page lock" which is
> basically what it comes down to if you really want to be sure that
> journal_try_to_free_buffers() succeeds. And also note that in principle
> there are other places which hold references to buffers without holding the
> page lock - for example writepage() in ordered mode (although this one is
> in practice hardly triggerable). So how we could fix at least the races
> with commit code is to implement launder_page() callback for ext3/4 which
> would wait for the previous transaction commit in case the page has buffers
> that are part of that commit (I don't want this logic in
> journal_try_to_free_buffers() as that is called also on memory-reclaim
> path, but journal_launder_page() is fine with me). This would be correct
> but could considerably slow down O_DIRECT writes in cases they're mixed
> with buffered writes so I'm not sure if this is acceptable.
Yes. I have been discussing all of these with Mingming. I agree with you
that it looks silly to update all the places so they do not hold a ref
on the buffer while waiting for spinlocks.
Adding a launder_page() seems to be the right approach. I wouldn't worry
about making O_DIRECT slower. Currently, it's failing with -EIO in this
case anyway.
Unfortunately, this happens at a customer site and I can't afford to
give them a complete rewrite-type patch - I need to get this into an
older distro release :( That's why I am trying to fix the cases.
I am finding more and more of these...
Thanks,
Badari
On Mon, 2008-05-05 at 19:06 +0200, Jan Kara wrote:
> On Thu 01-05-08 08:16:21, Badari Pulavarty wrote:
> > Hi Andrew & Jan,
> >
> > I was able to reproduce the customer problem involving DIO
> > (invalidate_inode_pages2) problem by writing simple testcase
> > to keep writing to a file using buffered writes and DIO writes
> > forever in a loop. I see DIO writes fail with -EIO.
> >
> > After a long debug, found 2 cases how this could happen.
> > These are race conditions with journal_try_to_free_buffers()
> > and journal_commit_transaction().
> >
> > 1) journal_submit_data_buffers() tries to get bh_state lock. If
> > try lock fails, it drops the j_list_lock and sleeps for
> > bh_state lock, while holding a reference on the buffer.
> > In the meanwhile, journal_try_to_free_buffers() can clean up the
> > journal head could call try_to_free_buffers(). try_to_free_buffers()
> > would fail due to the reference held by journal_submit_data_buffers()
> > - which in turn causes failues for DIO (invalidate_inode_pages2()).
> >
> > 2) When the buffer is on t_locked_list waiting for IO to finish,
> > we hold a reference and give up the cpu, if we can't get
> > bh_state lock. This causes try_to_free_buffers() to fail.
> >
> > Fix is to drop the reference on the buffer if we can't get
> > bh_state lock, give up the cpu and re-try the whole operation -
> > instead of waiting for the vh_state lock.
> >
> > Does this look like a resonable fix ?
> As Mingming pointed out there are few other places where we could hold
> the bh reference. Note also that we accumulate references to buffers in the
> wbuf[] list and we need that for submit_bh() which consumes one bh
> reference. Generally, it seems to me as a too fragile and impractical
> rule "nobody can hold bh reference when not holding page lock" which is
> basically what it comes down to if you really want to be sure that
> journal_try_to_free_buffers() succeeds. And also note that in principle
> there are other places which hold references to buffers without holding the
> page lock - for example writepage() in ordered mode (although this one is
> in practice hardly triggerable). So how we could fix at least the races
> with commit code is to implement launder_page() callback for ext3/4 which
> would wait for the previous transaction commit in case the page has buffers
> that are part of that commit (I don't want this logic in
> journal_try_to_free_buffers() as that is called also on memory-reclaim
> path, but journal_launder_page() is fine with me).
I am not sure how we are going to guarantee that, by the time
journal_try_to_free_buffers() gets called, the page's buffers are not
part of the current transaction commit (which could be a different one
from the one we waited on in ext3_launder_page()).
To me it seems more realistic to fix the races one by one.
There is still a window where journal_submit_data_buffers() has already
removed the jh from the bh (when it found the buffers were already
being synced) but still keeps a reference to the buffer head. If
journal_try_to_free_buffers() is called at that point,
try_to_free_buffers() is called directly, since there is no jh attached
to this buffer, and it fails because journal_submit_data_buffers()
hasn't finished its cleanup yet.
For this new race, we could just grab j_list_lock when retrying
try_to_free_buffers(), to force waiting for journal_commit_transaction()
to finish its flush work. But I'm not sure this is an acceptable
approach. A patch like this? Comments?
Mingming
---------------------------------------------------------------------
There are a few cases where direct IO can race with kjournald flushing
data buffers, which can result in direct IO returning an EIO error.
1) journal_submit_data_buffers() tries to get the bh_state lock. If the
trylock fails, it drops j_list_lock and sleeps for the bh_state lock,
while holding a reference on the buffer.
In the meanwhile, journal_try_to_free_buffers() can clean up the
journal head and call try_to_free_buffers(). try_to_free_buffers()
then fails due to the reference held by journal_submit_data_buffers()
- which in turn causes failures for DIO (invalidate_inode_pages2()).
2) When the buffer is on t_locked_list waiting for IO to finish,
we hold a reference and give up the cpu if we can't get the
bh_state lock. This causes try_to_free_buffers() to fail.
3) When journal_submit_data_buffers() sees that a buffer (bh1) is dirty
but fails to lock it, it releases j_list_lock and submits the other
buffers collected in the previous pass, with the reference to bh1 still
held. During this time journal_try_to_free_buffers() can clean up the
journal head of bh1 and remove it from the t_syncdata_list. Then
try_to_free_buffers() fails because of the reference held by
journal_submit_data_buffers().
4) journal_submit_data_buffers() has already removed the jh from the bh
(when it found the buffers were already being synced) but still keeps a
reference to the buffer head. If journal_try_to_free_buffers() is
called at that point, try_to_free_buffers() is called directly, since
there is no jh attached to this buffer, and it fails because
journal_submit_data_buffers() hasn't finished its cleanup yet.
The fix for the first three races is to drop the reference on the
buffer head when releasing j_list_lock, give up the cpu and retry the
whole operation.
This patch also fixes the fourth race: the data buffers have been
flushed to disk and the journal head cleared by
journal_submit_data_buffers(), which did not get a chance to release
the buffer head reference before journal_try_to_free_buffers() kicked in.
Signed-off-by: Badari Pulavarty <[email protected]>
Signed-off-by: Mingming Cao <[email protected]>
---
fs/jbd/commit.c | 21 ++++++++++++++++-----
fs/jbd/transaction.c | 13 +++++++++++++
2 files changed, 29 insertions(+), 5 deletions(-)
Index: linux-2.6.26-rc1/fs/jbd/commit.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/jbd/commit.c 2008-05-03 11:59:44.000000000 -0700
+++ linux-2.6.26-rc1/fs/jbd/commit.c 2008-05-09 14:44:36.000000000 -0700
@@ -79,12 +79,16 @@ nope:
/*
* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
+ * held. For ranking reasons we must trylock. If we lose, unlock the buffer
+ * if needed, drop the reference on the buffer, schedule away and
* return 0. j_list_lock is dropped in this case.
*/
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
+static int inverted_lock(journal_t *journal, struct buffer_head *bh, int locked)
{
if (!jbd_trylock_bh_state(bh)) {
+ if (locked)
+ unlock_buffer(bh);
+ put_bh(bh);
spin_unlock(&journal->j_list_lock);
schedule();
return 0;
@@ -209,19 +213,24 @@ write_out_data:
if (buffer_dirty(bh)) {
if (test_set_buffer_locked(bh)) {
BUFFER_TRACE(bh, "needs blocking lock");
+ put_bh(bh);
spin_unlock(&journal->j_list_lock);
/* Write out all data to prevent deadlocks */
journal_do_submit_data(wbuf, bufs);
bufs = 0;
- lock_buffer(bh);
spin_lock(&journal->j_list_lock);
+ continue;
}
locked = 1;
}
- /* We have to get bh_state lock. Again out of order, sigh. */
- if (!inverted_lock(journal, bh)) {
- jbd_lock_bh_state(bh);
+ /*
+ * We have to get bh_state lock. If the try lock fails,
+ * release the ref on the buffer, give up cpu and retry the
+ * whole operation.
+ */
+ if (!inverted_lock(journal, bh, locked)) {
spin_lock(&journal->j_list_lock);
+ continue;
}
/* Someone already cleaned up the buffer? */
if (!buffer_jbd(bh)
@@ -430,8 +439,7 @@ void journal_commit_transaction(journal_
err = -EIO;
spin_lock(&journal->j_list_lock);
}
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
+ if (!inverted_lock(journal, bh, 0)) {
spin_lock(&journal->j_list_lock);
continue;
}
Index: linux-2.6.26-rc1/fs/jbd/transaction.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/jbd/transaction.c 2008-05-03 11:59:44.000000000 -0700
+++ linux-2.6.26-rc1/fs/jbd/transaction.c 2008-05-09 09:53:57.000000000 -0700
@@ -1714,6 +1714,19 @@ int journal_try_to_free_buffers(journal_
goto busy;
} while ((bh = bh->b_this_page) != head);
ret = try_to_free_buffers(page);
+ if (ret == 0) {
+ /*
+ * it is possible that journal_submit_data_buffers()
+ * still holds the bh ref even after it clears the jh
+ * via journal_remove_journal_head(), which makes
+ * try_to_free_buffers() fail; retry under j_list_lock
+ * to wait for journal_submit_data_buffers() to finish
+ * removing the bh from the sync data list
+ */
+ spin_lock(&journal->j_list_lock);
+ ret = try_to_free_buffers(page);
+ spin_unlock(&journal->j_list_lock);
+ }
busy:
return ret;
}
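The transaction.c hunk above leans on one invariant: the commit thread
only holds its transient buffer reference while it also holds
j_list_lock, and drops that reference before releasing the lock. A
minimal user-space sketch of this retry-under-the-lock pattern follows;
it is purely illustrative, none of these names are kernel API, and it
assumes the releaser really does drop its reference before unlocking:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ j_list_lock */
static atomic_int bh_count;                                   /* ~ bh->b_count */

static int try_to_free(void)          /* ~ try_to_free_buffers(): busy if ref held */
{
	return atomic_load(&bh_count) == 0;
}

static void *commit_thread(void *arg) /* ~ journal_submit_data_buffers() */
{
	(void)arg;
	pthread_mutex_lock(&list_lock);
	atomic_fetch_add(&bh_count, 1); /* transient ref taken under the lock */
	usleep(2000);                   /* list walking, jh cleanup, ... */
	atomic_fetch_sub(&bh_count, 1); /* and dropped before the lock */
	pthread_mutex_unlock(&list_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, commit_thread, NULL);
	usleep(500);                    /* let the commit thread win the race */
	int freed = try_to_free();      /* typically fails: ref still held */
	if (!freed) {
		/* retry under the lock: once we own it, the commit side has
		 * finished and dropped its transient reference */
		pthread_mutex_lock(&list_lock);
		freed = try_to_free();
		pthread_mutex_unlock(&list_lock);
	}
	printf("freed=%d\n", freed);
	pthread_join(t, NULL);
	return 0;
}

Note the retry only closes the window if every transient reference is
dropped before j_list_lock is released, which is exactly what the rest
of the patch arranges, and what Jan questions below.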
Updating the current transaction's t_state is protected by j_state_lock.
We need to do the same when updating the t_state to T_COMMIT.
Signed-off-by: Mingming Cao <[email protected]>
---
fs/jbd2/commit.c | 2 ++
1 file changed, 2 insertions(+)
Index: linux-2.6.26-rc1/fs/jbd2/commit.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/jbd2/commit.c 2008-05-03 11:59:44.000000000 -0700
+++ linux-2.6.26-rc1/fs/jbd2/commit.c 2008-05-09 13:32:01.000000000 -0700
@@ -560,7 +560,9 @@ void jbd2_journal_commit_transaction(jou
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
*/
+ spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_COMMIT;
+ spin_unlock(&journal->j_state_lock);
stats.u.run.rs_logging = jiffies;
stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
Updating the current transaction's t_state is protected by j_state_lock.
We need to do the same when updating the t_state to T_COMMIT.
Signed-off-by: Mingming Cao <[email protected]>
---
fs/jbd/commit.c | 2 ++
1 file changed, 2 insertions(+)
Index: linux-2.6.26-rc1/fs/jbd/commit.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/jbd/commit.c 2008-05-09 14:46:25.000000000 -0700
+++ linux-2.6.26-rc1/fs/jbd/commit.c 2008-05-09 15:11:00.000000000 -0700
@@ -478,7 +478,9 @@ void journal_commit_transaction(journal_
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
*/
+ spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_COMMIT;
+ spin_unlock(&journal->j_state_lock);
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
On Fri 09-05-08 15:39:43, Mingming Cao wrote:
> Updating the current transaction's t_state is protected by j_state_lock.
> We need to do the same when updating the t_state to T_COMMIT.
>
> Signed-off-by: Mingming Cao <[email protected]>
Thanks for the fix. You can add
Acked-by: Jan Kara <[email protected]>
(and also to the JBD2 patch)
Honza
> ---
> fs/jbd/commit.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> Index: linux-2.6.26-rc1/fs/jbd/commit.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/jbd/commit.c 2008-05-09 14:46:25.000000000 -0700
> +++ linux-2.6.26-rc1/fs/jbd/commit.c 2008-05-09 15:11:00.000000000 -0700
> @@ -478,7 +478,9 @@ void journal_commit_transaction(journal_
> * transaction! Now comes the tricky part: we need to write out
> * metadata. Loop over the transaction's entire buffer list:
> */
> + spin_lock(&journal->j_state_lock);
> commit_transaction->t_state = T_COMMIT;
> + spin_unlock(&journal->j_state_lock);
>
> J_ASSERT(commit_transaction->t_nr_buffers <=
> commit_transaction->t_outstanding_credits);
>
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
Hello,
On Fri 09-05-08 15:27:52, Mingming Cao wrote:
> > > I was able to reproduce the customer problem involving DIO
> > > (invalidate_inode_pages2) problem by writing simple testcase
> > > to keep writing to a file using buffered writes and DIO writes
> > > forever in a loop. I see DIO writes fail with -EIO.
> > >
> > > After a long debug, found 2 cases how this could happen.
> > > These are race conditions with journal_try_to_free_buffers()
> > > and journal_commit_transaction().
> > >
> > > 1) journal_submit_data_buffers() tries to get bh_state lock. If
> > > try lock fails, it drops the j_list_lock and sleeps for
> > > bh_state lock, while holding a reference on the buffer.
> > > In the meanwhile, journal_try_to_free_buffers() can clean up the
> > > journal head and call try_to_free_buffers(). try_to_free_buffers()
> > > would fail due to the reference held by journal_submit_data_buffers()
> > > - which in turn causes failures for DIO (invalidate_inode_pages2()).
> > >
> > > 2) When the buffer is on t_locked_list waiting for IO to finish,
> > > we hold a reference and give up the cpu, if we can't get
> > > bh_state lock. This causes try_to_free_buffers() to fail.
> > >
> > > Fix is to drop the reference on the buffer if we can't get
> > > bh_state lock, give up the cpu and re-try the whole operation -
> > > instead of waiting for the bh_state lock.
> > >
> > > Does this look like a reasonable fix ?
> > As Mingming pointed out there are few other places where we could hold
> > the bh reference. Note also that we accumulate references to buffers in the
> > wbuf[] list and we need that for submit_bh() which consumes one bh
> > reference. Generally, it seems to me as a too fragile and impractical
> > rule "nobody can hold bh reference when not holding page lock" which is
> > basically what it comes down to if you really want to be sure that
> > journal_try_to_free_buffers() succeeds. And also note that in principle
> > there are other places which hold references to buffers without holding the
> > page lock - for example writepage() in ordered mode (although this one is
> > in practice hardly triggerable). So how we could fix at least the races
> > with commit code is to implement launder_page() callback for ext3/4 which
> > would wait for the previous transaction commit in case the page has buffers
> > that are part of that commit (I don't want this logic in
> > journal_try_to_free_buffers() as that is called also on memory-reclaim
> > path, but journal_launder_page() is fine with me).
>
> I am not sure how we are going to guarantee that, by the time
> journal_try_to_free_buffers() gets called, the page's buffers are not
> part of the current transaction commit (which could be different from
> the one we waited on in ext3_launder_page())?
Hmm, you are right. It is not enough to just wait in ext3_launder_page()
because we don't have a transaction for direct_IO started yet. But if we
actually released buffers from the page there, it should be fine.
> To me it seems more realistic to fix the races one by one.
Not to me, really. The scheme for buffer references you are trying to
impose is awkward to say the least. First, it is completely
counter-intuitive (at least to me ;), second, it is impractical as well.
For example in your scheme, you have no sensible way of locking ordered
data mode buffer - you cannot just do: get the reference and do
lock_buffer() because that violates your requirements. The only reasonable
way you could do that is to lock the page to make sure buffer won't go away
from you - but you cannot currently do that in journal commit code because
of lock ordering. So the only way I can see which is left is: get some jbd
spin lock to serialize with journal_try_to_free_buffers(), get the buffer
reference, try to lock buffer, if it fails, drop everything and restart.
And this is IMO no-go...
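Spelled out in code, the rejected scheme would look roughly like this
(a paraphrase using jbd's real names; the loop itself is hypothetical):

retry:
	spin_lock(&journal->j_list_lock); /* serialize vs. journal_try_to_free_buffers() */
	get_bh(bh);
	if (test_set_buffer_locked(bh)) { /* trylock only: lock ordering forbids sleeping */
		put_bh(bh);
		spin_unlock(&journal->j_list_lock);
		goto retry;               /* drop everything and restart - the "no-go" */
	}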
And BTW even if you fix such races, I think you'll still have races like:
CPU1:                                      CPU2:
filemap_write_and_wait()
                                           dirty a page
                                           msync() (dirties buffers)
invalidate_inode_pages2_range() -> -EIO
The code could historically always return EIO when mixing buffered and
unbuffered accesses, and the question is under which circumstances that
is acceptable. I agree that the current state, where doing "buffered
write, DIO write" in sequence can get you EIO, is bad and we should fix
it. But I'm not sure we should fix the EIO return under all possible
circumstances at all costs...
> There is still a window where journal_submit_data_buffers() has already
> removed the jh from the bh (having found the buffer already being
> synced), but still keeps a reference to the buffer head. If
> journal_try_to_free_buffers() is called at that point,
> try_to_free_buffers() is invoked directly, since there is no jh
> attached to this buffer, and fails because
> journal_submit_data_buffers() hasn't finished its cleanup yet.
>
> For this new race, we could just grab the j_list_lock when retrying
> try_to_free_buffers(), to force waiting for
> journal_commit_transaction() to finish its flush work. But I am not
> sure whether that is an acceptable approach?
>
> Patch like this? Comments?
>
> ---------------------------------------------------------------------
> There are a few cases where direct IO can race with kjournald flushing
> data buffers, causing direct IO to return an EIO error.
>
> 1) journal_submit_data_buffers() tries to get the bh_state lock. If the
> trylock fails, it drops the j_list_lock and sleeps on the bh_state
> lock, while holding a reference on the buffer.
> In the meanwhile, journal_try_to_free_buffers() can clean up the
> journal head and call try_to_free_buffers(). try_to_free_buffers()
> would fail due to the reference held by journal_submit_data_buffers(),
> which in turn causes failures for DIO (invalidate_inode_pages2()).
>
> 2) When the buffer is on t_locked_list waiting for IO to finish, we
> hold a reference and give up the cpu if we can't get the bh_state
> lock. This causes try_to_free_buffers() to fail.
>
> 3) When journal_submit_data_buffers() sees that a buffer bh1 is dirty
> but fails to lock it, it releases the j_list_lock and submits the other
> buffers collected so far, with the reference to bh1 still held. During
> this time journal_try_to_free_buffers() could clean up the journal head
> of bh1 and remove it from the t_syncdata_list. try_to_free_buffers()
> would then fail because of the reference held by
> journal_submit_data_buffers().
>
> 4) journal_submit_data_buffers() has already removed the jh from the bh
> (having found the buffer already being synced), but still keeps a
> reference to the buffer head. If journal_try_to_free_buffers() is
> called at that point, try_to_free_buffers() is invoked directly, since
> there is no jh attached to this buffer, and fails because
> journal_submit_data_buffers() hasn't finished its cleanup yet.
>
> The fix for the first three races is to drop the reference on the
> buffer head when releasing the j_list_lock, give up the cpu, and retry
> the whole operation.
>
> This patch also fixes the race where data buffers have been flushed to
> disk and the journal head cleared by journal_submit_data_buffers(), but
> it did not get a chance to release the buffer head reference before
> journal_try_to_free_buffers() kicked in.
>
>
> Signed-off-by: Badari Pulavarty <[email protected]>
> Signed-off-by: Mingming Cao <[email protected]>
> ---
> fs/jbd/commit.c | 21 ++++++++++++++++-----
> fs/jbd/transaction.c | 13 +++++++++++++
> 2 files changed, 29 insertions(+), 5 deletions(-)
>
> Index: linux-2.6.26-rc1/fs/jbd/commit.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/jbd/commit.c 2008-05-03 11:59:44.000000000 -0700
> +++ linux-2.6.26-rc1/fs/jbd/commit.c 2008-05-09 14:44:36.000000000 -0700
> @@ -79,12 +79,16 @@ nope:
>
> /*
> * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
> - * held. For ranking reasons we must trylock. If we lose, schedule away and
> + * held. For ranking reasons we must trylock. If we lose, unlock the buffer
> + * if needed, drop the reference on the buffer, schedule away and
> * return 0. j_list_lock is dropped in this case.
> */
> -static int inverted_lock(journal_t *journal, struct buffer_head *bh)
> +static int inverted_lock(journal_t *journal, struct buffer_head *bh, int locked)
> {
> if (!jbd_trylock_bh_state(bh)) {
> + if (locked)
> + unlock_buffer(bh);
> + put_bh(bh);
> spin_unlock(&journal->j_list_lock);
> schedule();
> return 0;
> @@ -209,19 +213,24 @@ write_out_data:
> if (buffer_dirty(bh)) {
> if (test_set_buffer_locked(bh)) {
> BUFFER_TRACE(bh, "needs blocking lock");
> + put_bh(bh);
> spin_unlock(&journal->j_list_lock);
> /* Write out all data to prevent deadlocks */
> journal_do_submit_data(wbuf, bufs);
> bufs = 0;
> - lock_buffer(bh);
> spin_lock(&journal->j_list_lock);
> + continue;
^^^ Here you can see what I wrote above. Basically you just busy-loop
waiting for the buffer lock. You should at least put a schedule() there
so that you don't lock up the CPU, but it's ugly anyway.
> }
> locked = 1;
> }
> - /* We have to get bh_state lock. Again out of order, sigh. */
> - if (!inverted_lock(journal, bh)) {
> - jbd_lock_bh_state(bh);
> + /*
> + * We have to get bh_state lock. If the try lock fails,
> + * release the ref on the buffer, give up cpu and retry the
> + * whole operation.
> + */
> + if (!inverted_lock(journal, bh, locked)) {
> spin_lock(&journal->j_list_lock);
> + continue;
> }
^^^ And here you add a place where we are not guaranteed to make any
progress... If someone intensively spins on that buffer, commit code could
cycle here forever (or at least for quite a long time).
> /* Someone already cleaned up the buffer? */
> if (!buffer_jbd(bh)
> @@ -430,8 +439,7 @@ void journal_commit_transaction(journal_
> err = -EIO;
> spin_lock(&journal->j_list_lock);
> }
> - if (!inverted_lock(journal, bh)) {
> - put_bh(bh);
> + if (!inverted_lock(journal, bh, 0)) {
> spin_lock(&journal->j_list_lock);
> continue;
> }
> Index: linux-2.6.26-rc1/fs/jbd/transaction.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/jbd/transaction.c 2008-05-03 11:59:44.000000000 -0700
> +++ linux-2.6.26-rc1/fs/jbd/transaction.c 2008-05-09 09:53:57.000000000 -0700
> @@ -1714,6 +1714,19 @@ int journal_try_to_free_buffers(journal_
> goto busy;
> } while ((bh = bh->b_this_page) != head);
> ret = try_to_free_buffers(page);
> + if (ret == 0) {
> + /*
> + * it is possible that journal_submit_data_buffers()
> + * still holds the bh ref even after it clears the jh
> + * via journal_remove_journal_head(), which makes
> + * try_to_free_buffers() fail; retry under j_list_lock
> + * to wait for journal_submit_data_buffers() to finish
> + * removing the bh from the sync data list
> + */
> + spin_lock(&journal->j_list_lock);
> + ret = try_to_free_buffers(page);
> + spin_unlock(&journal->j_list_lock);
> + }
> busy:
> return ret;
> }
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon, 2008-05-12 at 17:54 +0200, Jan Kara wrote:
> Hello,
>
> On Fri 09-05-08 15:27:52, Mingming Cao wrote:
> > > > I was able to reproduce the customer problem involving DIO
> > > > (invalidate_inode_pages2) problem by writing simple testcase
> > > > to keep writing to a file using buffered writes and DIO writes
> > > > forever in a loop. I see DIO writes fail with -EIO.
> > > >
> > > > After a long debug, found 2 cases how this could happen.
> > > > These are race conditions with journal_try_to_free_buffers()
> > > > and journal_commit_transaction().
> > > >
> > > > 1) journal_submit_data_buffers() tries to get bh_state lock. If
> > > > try lock fails, it drops the j_list_lock and sleeps for
> > > > bh_state lock, while holding a reference on the buffer.
> > > > In the meanwhile, journal_try_to_free_buffers() can clean up the
> > > > journal head and call try_to_free_buffers(). try_to_free_buffers()
> > > > would fail due to the reference held by journal_submit_data_buffers()
> > > > - which in turn causes failures for DIO (invalidate_inode_pages2()).
> > > >
> > > > 2) When the buffer is on t_locked_list waiting for IO to finish,
> > > > we hold a reference and give up the cpu, if we can't get
> > > > bh_state lock. This causes try_to_free_buffers() to fail.
> > > >
> > > > Fix is to drop the reference on the buffer if we can't get
> > > > bh_state lock, give up the cpu and re-try the whole operation -
> > > > instead of waiting for the bh_state lock.
> > > >
> > > > Does this look like a reasonable fix ?
> > > As Mingming pointed out there are few other places where we could hold
> > > the bh reference. Note also that we accumulate references to buffers in the
> > > wbuf[] list and we need that for submit_bh() which consumes one bh
> > > reference. Generally, it seems to me as a too fragile and impractical
> > > rule "nobody can hold bh reference when not holding page lock" which is
> > > basically what it comes down to if you really want to be sure that
> > > journal_try_to_free_buffers() succeeds. And also note that in principle
> > > there are other places which hold references to buffers without holding the
> > > page lock - for example writepage() in ordered mode (although this one is
> > > in practice hardly triggerable). So how we could fix at least the races
> > > with commit code is to implement launder_page() callback for ext3/4 which
> > > would wait for the previous transaction commit in case the page has buffers
> > > that are part of that commit (I don't want this logic in
> > > journal_try_to_free_buffers() as that is called also on memory-reclaim
> > > path, but journal_launder_page() is fine with me).
> >
> > I am not sure how we are going to guarantee that, by the time
> > journal_try_to_free_buffers() gets called, the page's buffers are not
> > part of the current transaction commit (which could be different from
> > the one we waited on in ext3_launder_page())?
> Hmm, you are right. It is not enough to just wait in ext3_launder_page()
> because we don't have a transaction for direct_IO started yet. But if we
> actually released buffers from the page there, it should be fine.
>
Do you mean calling journal_try_to_free_buffers() inside
ext3_launder_page()? I think we still need some lock to serialize
launder_page() with the kjournald commit code (not sure whether that is
okay?); otherwise there is always a window where, by the time
try_to_free_buffers() gets called, the current transaction has
changed...
> > To me it seems more realistic to fix the races one by one.
> Not to me, really. The scheme for buffer references you are trying to
> impose is awkward to say the least. First, it is completely
> counter-intuitive (at least to me ;), second, it is impractical as well.
Sigh... I am not very happy with the solution either, but I could not
see a decent solution that would fix this problem. Currently we reliably
hit the EIO error within 10 minutes with the simple parallel buffered IO
and direct IO :(...
> For example in your scheme, you have no sensible way of locking ordered
> data mode buffer - you cannot just do: get the reference and do
> lock_buffer() because that violates your requirements. The only reasonable
> way you could do that is to lock the page to make sure buffer won't go away
> from you - but you cannot currently do that in journal commit code because
> of lock ordering. So the only way I can see which is left is: get some jbd
> spin lock to serialize with journal_try_to_free_buffers(), get the buffer
> reference, try to lock buffer, if it fails, drop everything and restart.
> And this is IMO no-go...
> And BTW even if you fix such races, I think you'll still have races like:
> CPU1:                                      CPU2:
> filemap_write_and_wait()
>                                            dirty a page
>                                            msync() (dirties buffers)
> invalidate_inode_pages2_range() -> -EIO
>
I can see this is possible with mapped IO. But for buffered IO, since
direct IO is holding i_mutex, this case should not happen, right?
> The code could historically always return EIO when mixing buffered and
> unbuffered accesses, and the question is under which circumstances that
> is acceptable. I agree that the current state, where doing "buffered
> write, DIO write" in sequence can get you EIO, is bad and we should fix
> it. But I'm not sure we should fix the EIO return under all possible
> circumstances at all costs...
>
> > There is still a window where journal_submit_data_buffers() has
> > already removed the jh from the bh (having found the buffer already
> > being synced), but still keeps a reference to the buffer head. If
> > journal_try_to_free_buffers() is called at that point,
> > try_to_free_buffers() is invoked directly, since there is no jh
> > attached to this buffer, and fails because
> > journal_submit_data_buffers() hasn't finished its cleanup yet.
> >
> > For this new race, we could just grab the j_list_lock when retrying
> > try_to_free_buffers(), to force waiting for
> > journal_commit_transaction() to finish its flush work. But I am not
> > sure whether that is an acceptable approach?
> >
> > Patch like this? Comments?
> >
> > ---------------------------------------------------------------------
> > There are a few cases where direct IO can race with kjournald
> > flushing data buffers, causing direct IO to return an EIO error.
> >
> > 1) journal_submit_data_buffers() tries to get the bh_state lock. If
> > the trylock fails, it drops the j_list_lock and sleeps on the
> > bh_state lock, while holding a reference on the buffer.
> > In the meanwhile, journal_try_to_free_buffers() can clean up the
> > journal head and call try_to_free_buffers(). try_to_free_buffers()
> > would fail due to the reference held by journal_submit_data_buffers(),
> > which in turn causes failures for DIO (invalidate_inode_pages2()).
> >
> > 2) When the buffer is on t_locked_list waiting for IO to finish, we
> > hold a reference and give up the cpu if we can't get the bh_state
> > lock. This causes try_to_free_buffers() to fail.
> >
> > 3) When journal_submit_data_buffers() sees that a buffer bh1 is dirty
> > but fails to lock it, it releases the j_list_lock and submits the
> > other buffers collected so far, with the reference to bh1 still held.
> > During this time journal_try_to_free_buffers() could clean up the
> > journal head of bh1 and remove it from the t_syncdata_list.
> > try_to_free_buffers() would then fail because of the reference held
> > by journal_submit_data_buffers().
> >
> > 4) journal_submit_data_buffers() has already removed the jh from the
> > bh (having found the buffer already being synced), but still keeps a
> > reference to the buffer head. If journal_try_to_free_buffers() is
> > called at that point, try_to_free_buffers() is invoked directly,
> > since there is no jh attached to this buffer, and fails because
> > journal_submit_data_buffers() hasn't finished its cleanup yet.
> >
> > The fix for the first three races is to drop the reference on the
> > buffer head when releasing the j_list_lock, give up the cpu, and
> > retry the whole operation.
> >
> > This patch also fixes the race where data buffers have been flushed
> > to disk and the journal head cleared by journal_submit_data_buffers(),
> > but it did not get a chance to release the buffer head reference
> > before journal_try_to_free_buffers() kicked in.
> >
> >
> > Signed-off-by: Badari Pulavarty <[email protected]>
> > Signed-off-by: Mingming Cao <[email protected]>
> > ---
> > fs/jbd/commit.c | 21 ++++++++++++++++-----
> > fs/jbd/transaction.c | 13 +++++++++++++
> > 2 files changed, 29 insertions(+), 5 deletions(-)
> >
> > Index: linux-2.6.26-rc1/fs/jbd/commit.c
> > ===================================================================
> > --- linux-2.6.26-rc1.orig/fs/jbd/commit.c 2008-05-03 11:59:44.000000000 -0700
> > +++ linux-2.6.26-rc1/fs/jbd/commit.c 2008-05-09 14:44:36.000000000 -0700
> > @@ -79,12 +79,16 @@ nope:
> >
> > /*
> > * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
> > - * held. For ranking reasons we must trylock. If we lose, schedule away and
> > + * held. For ranking reasons we must trylock. If we lose, unlock the buffer
> > + * if needed, drop the reference on the buffer, schedule away and
> > * return 0. j_list_lock is dropped in this case.
> > */
> > -static int inverted_lock(journal_t *journal, struct buffer_head *bh)
> > +static int inverted_lock(journal_t *journal, struct buffer_head *bh, int locked)
> > {
> > if (!jbd_trylock_bh_state(bh)) {
> > + if (locked)
> > + unlock_buffer(bh);
> > + put_bh(bh);
> > spin_unlock(&journal->j_list_lock);
> > schedule();
> > return 0;
> > @@ -209,19 +213,24 @@ write_out_data:
> > if (buffer_dirty(bh)) {
> > if (test_set_buffer_locked(bh)) {
> > BUFFER_TRACE(bh, "needs blocking lock");
> > + put_bh(bh);
> > spin_unlock(&journal->j_list_lock);
> > /* Write out all data to prevent deadlocks */
> > journal_do_submit_data(wbuf, bufs);
> > bufs = 0;
> > - lock_buffer(bh);
> > spin_lock(&journal->j_list_lock);
> > + continue;
> ^^^ Here you can see what I wrote above. Basically you just busy-loop
> waiting for the buffer lock. You should at least put a schedule() there
> so that you don't lock up the CPU, but it's ugly anyway.
>
Yup.
The conflict is that if we still hold the bh reference after releasing
the j_list_lock, journal_try_to_free_buffers() can come in and return
EIO to direct IO since the buffer is busy; but if we drop the bh
reference when we release the j_list_lock, journal_try_to_free_buffers()
may free that buffer, so we can no longer do lock_buffer() here and have
to loop instead. This is a trade-off...
On the other hand, journal_submit_data_buffers() continues processing
this buffer after re-grabbing the j_list_lock even if it has been
removed from the t_syncdata_list by __journal_try_to_free_buffers().
IMO this is not optimal.
> > }
> > locked = 1;
> > }
> > - /* We have to get bh_state lock. Again out of order, sigh. */
> > - if (!inverted_lock(journal, bh)) {
> > - jbd_lock_bh_state(bh);
> > + /*
> > + * We have to get bh_state lock. If the try lock fails,
> > + * release the ref on the buffer, give up cpu and retry the
> > + * whole operation.
> > + */
> > + if (!inverted_lock(journal, bh, locked)) {
> > spin_lock(&journal->j_list_lock);
> > + continue;
> > }
> ^^^ And here you add a place where we are not guaranteed to make any
> progress... If someone intensively spins on that buffer, commit code could
> cycle here forever (or at least for quite a long time).
>
> > /* Someone already cleaned up the buffer? */
> > if (!buffer_jbd(bh)
> > @@ -430,8 +439,7 @@ void journal_commit_transaction(journal_
> > err = -EIO;
> > spin_lock(&journal->j_list_lock);
> > }
> > - if (!inverted_lock(journal, bh)) {
> > - put_bh(bh);
> > + if (!inverted_lock(journal, bh, 0)) {
> > spin_lock(&journal->j_list_lock);
> > continue;
> > }
> > Index: linux-2.6.26-rc1/fs/jbd/transaction.c
> > ===================================================================
> > --- linux-2.6.26-rc1.orig/fs/jbd/transaction.c 2008-05-03 11:59:44.000000000 -0700
> > +++ linux-2.6.26-rc1/fs/jbd/transaction.c 2008-05-09 09:53:57.000000000 -0700
> > @@ -1714,6 +1714,19 @@ int journal_try_to_free_buffers(journal_
> > goto busy;
> > } while ((bh = bh->b_this_page) != head);
> > ret = try_to_free_buffers(page);
> > + if (ret == 0) {
> > + /*
> > + * it is possible that journal_submit_data_buffers()
> > + * still holds the bh ref even after it clears the jh
> > + * via journal_remove_journal_head(), which makes
> > + * try_to_free_buffers() fail; retry under j_list_lock
> > + * to wait for journal_submit_data_buffers() to finish
> > + * removing the bh from the sync data list
> > + */
> > + spin_lock(&journal->j_list_lock);
> > + ret = try_to_free_buffers(page);
> > + spin_unlock(&journal->j_list_lock);
> > + }
> > busy:
> > return ret;
> > }
>
> Honza
On Mon, 2008-05-12 at 17:54 +0200, Jan Kara wrote:
> Hello,
>
> On Fri 09-05-08 15:27:52, Mingming Cao wrote:
> > > > I was able to reproduce the customer problem involving DIO
> > > > (invalidate_inode_pages2) problem by writing simple testcase
> > > > to keep writing to a file using buffered writes and DIO writes
> > > > forever in a loop. I see DIO writes fail with -EIO.
> > > >
> > > > After a long debug, found 2 cases how this could happen.
> > > > These are race conditions with journal_try_to_free_buffers()
> > > > and journal_commit_transaction().
> > > >
> > > > 1) journal_submit_data_buffers() tries to get bh_state lock. If
> > > > try lock fails, it drops the j_list_lock and sleeps for
> > > > bh_state lock, while holding a reference on the buffer.
> > > > In the meanwhile, journal_try_to_free_buffers() can clean up the
> > > > journal head and call try_to_free_buffers(). try_to_free_buffers()
> > > > would fail due to the reference held by journal_submit_data_buffers()
> > > > - which in turn causes failures for DIO (invalidate_inode_pages2()).
> > > >
> > > > 2) When the buffer is on t_locked_list waiting for IO to finish,
> > > > we hold a reference and give up the cpu, if we can't get
> > > > bh_state lock. This causes try_to_free_buffers() to fail.
> > > >
> > > > Fix is to drop the reference on the buffer if we can't get
> > > > bh_state lock, give up the cpu and re-try the whole operation -
> > > > instead of waiting for the bh_state lock.
> > > >
> > > > Does this look like a reasonable fix ?
> > > As Mingming pointed out there are few other places where we could hold
> > > the bh reference. Note also that we accumulate references to buffers in the
> > > wbuf[] list and we need that for submit_bh() which consumes one bh
> > > reference. Generally, it seems to me as a too fragile and impractical
> > > rule "nobody can hold bh reference when not holding page lock" which is
> > > basically what it comes down to if you really want to be sure that
> > > journal_try_to_free_buffers() succeeds. And also note that in principle
> > > there are other places which hold references to buffers without holding the
> > > page lock - for example writepage() in ordered mode (although this one is
> > > in practice hardly triggerable). So how we could fix at least the races
> > > with commit code is to implement launder_page() callback for ext3/4 which
> > > would wait for the previous transaction commit in case the page has buffers
> > > that are part of that commit (I don't want this logic in
> > > journal_try_to_free_buffers() as that is called also on memory-reclaim
> > > path, but journal_launder_page() is fine with me).
> >
> > I am not sure how we are going to guarantee that, by the time
> > journal_try_to_free_buffers() gets called, the page's buffers are not
> > part of the current transaction commit (which could be different from
> > the one we waited on in ext3_launder_page())?
> Hmm, you are right. It is not enough to just wait in ext3_launder_page()
> because we don't have a transaction for direct_IO started yet. But if we
> actually released buffers from the page there, it should be fine.
Does this match what you are thinking? It certainly slows down the DIO
path, but on the positive side it doesn't disturb the other code
paths... thanks for your feedback!
--------------------------------------------
An unexpected EIO error gets returned when writing to a file
using buffered writes and DIO writes at the same time.
We found a number of places where journal_try_to_free_buffers() can
race with journal_commit_transaction(): the latter still holds the
reference to the buffers on the t_syncdata_list or t_locked_list while
journal_try_to_free_buffers() tries to free them, which results in an
EIO error being returned to the dio caller.
The fix is to retry the freeing when journal_try_to_free_buffers() fails
to free data buffers that journal_commit_transaction() still references.
This is done by implementing an ext3 launder_page() callback, instead of
changing journal_try_to_free_buffers() itself, so that other code paths
calling journal_try_to_free_buffers() are unaffected and only the dio
path is.
Signed-off-by: Mingming Cao <[email protected]>
Index: linux-2.6.26-rc1/fs/ext3/inode.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/ext3/inode.c 2008-05-03 11:59:44.000000000 -0700
+++ linux-2.6.26-rc1/fs/ext3/inode.c 2008-05-12 12:41:27.000000000 -0700
@@ -1766,6 +1766,23 @@ static int ext3_journalled_set_page_dirt
return __set_page_dirty_nobuffers(page);
}
+static int ext3_launder_page(struct page *page)
+{
+ int ret;
+ int retry = 5;
+
+ while (retry --) {
+ ret = ext3_releasepage(page, GFP_KERNEL);
+ if (ret == 1)
+ break;
+ else
+ schedule();
+ }
+
+ return ret;
+}
+
+
static const struct address_space_operations ext3_ordered_aops = {
.readpage = ext3_readpage,
.readpages = ext3_readpages,
@@ -1778,6 +1795,7 @@ static const struct address_space_operat
.releasepage = ext3_releasepage,
.direct_IO = ext3_direct_IO,
.migratepage = buffer_migrate_page,
+ .launder_page = ext3_launder_page,
};
static const struct address_space_operations ext3_writeback_aops = {
@@ -1792,6 +1810,7 @@ static const struct address_space_operat
.releasepage = ext3_releasepage,
.direct_IO = ext3_direct_IO,
.migratepage = buffer_migrate_page,
+ .launder_page = ext3_launder_page,
};
static const struct address_space_operations ext3_journalled_aops = {
@@ -1805,6 +1824,7 @@ static const struct address_space_operat
.bmap = ext3_bmap,
.invalidatepage = ext3_invalidatepage,
.releasepage = ext3_releasepage,
+ .launder_page = ext3_launder_page,
};
void ext3_set_aops(struct inode *inode)
On Mon 12-05-08 12:23:26, Mingming Cao wrote:
> On Mon, 2008-05-12 at 17:54 +0200, Jan Kara wrote:
> > On Fri 09-05-08 15:27:52, Mingming Cao wrote:
> > > > > I was able to reproduce the customer problem involving DIO
> > > > > (invalidate_inode_pages2) problem by writing simple testcase
> > > > > to keep writing to a file using buffered writes and DIO writes
> > > > > forever in a loop. I see DIO writes fail with -EIO.
> > > > >
> > > > > After a long debug, found 2 cases how this could happen.
> > > > > These are race conditions with journal_try_to_free_buffers()
> > > > > and journal_commit_transaction().
> > > > >
> > > > > 1) journal_submit_data_buffers() tries to get bh_state lock. If
> > > > > try lock fails, it drops the j_list_lock and sleeps for
> > > > > bh_state lock, while holding a reference on the buffer.
> > > > > In the meanwhile, journal_try_to_free_buffers() can clean up the
> > > > > journal head and call try_to_free_buffers(). try_to_free_buffers()
> > > > > would fail due to the reference held by journal_submit_data_buffers()
> > > > > - which in turn causes failures for DIO (invalidate_inode_pages2()).
> > > > >
> > > > > 2) When the buffer is on t_locked_list waiting for IO to finish,
> > > > > we hold a reference and give up the cpu, if we can't get
> > > > > bh_state lock. This causes try_to_free_buffers() to fail.
> > > > >
> > > > > Fix is to drop the reference on the buffer if we can't get
> > > > > bh_state lock, give up the cpu and re-try the whole operation -
> > > > > instead of waiting for the bh_state lock.
> > > > >
> > > > > Does this look like a reasonable fix ?
> > > > As Mingming pointed out there are few other places where we could hold
> > > > the bh reference. Note also that we accumulate references to buffers in the
> > > > wbuf[] list and we need that for submit_bh() which consumes one bh
> > > > reference. Generally, it seems to me as a too fragile and impractical
> > > > rule "nobody can hold bh reference when not holding page lock" which is
> > > > basically what it comes down to if you really want to be sure that
> > > > journal_try_to_free_buffers() succeeds. And also note that in principle
> > > > there are other places which hold references to buffers without holding the
> > > > page lock - for example writepage() in ordered mode (although this one is
> > > > in practice hardly triggerable). So how we could fix at least the races
> > > > with commit code is to implement launder_page() callback for ext3/4 which
> > > > would wait for the previous transaction commit in case the page has buffers
> > > > that are part of that commit (I don't want this logic in
> > > > journal_try_to_free_buffers() as that is called also on memory-reclaim
> > > > path, but journal_launder_page() is fine with me).
> > >
> > > I am not sure how we are going to guarantee that, by the time
> > > journal_try_to_free_buffers() gets called, the page's buffers are
> > > not part of the current transaction commit (which could be
> > > different from the one we waited on in ext3_launder_page())?
> > Hmm, you are right. It is not enough to just wait in ext3_launder_page()
> > because we don't have a transaction for direct_IO started yet. But if we
> > actually released buffers from the page there, it should be fine.
> >
>
> Do you mean calling journal_try_to_free_buffers() inside
> ext3_launder_page()? I think we still need some lock to serialize
> launder_page() with the kjournald commit code (not sure whether that is
> okay?); otherwise there is always a window where, by the time
> try_to_free_buffers() gets called, the current transaction has
> changed...
Once we succeed in removing journal heads from data buffers in
journal_try_to_free_buffers(), they can be re-added only if someone
dirties the page via mmap and writepage is called (which I'd neglect for
now as too rare and not common usage). So after that moment, once the
commit code drops its references to the buffer heads, we can happily
remove those buffers.
> > > To me it seems more realistic to fix the races one by one.
> > Not to me, really. The scheme for buffer references you are trying to
> > impose is awkward to say the least. First, it is completely
> > counter-intuitive (at least to me ;), second, it is impractical as well.
>
> Sigh... I am not very happy with the solution either, but I could not
> see a decent solution that would fix this problem. Currently we
> reliably hit the EIO error within 10 minutes with the simple parallel
> buffered IO and direct IO :(...
Yes, buffered and unbuffered writes should happily live together...
> > For example in your scheme, you have no sensible way of locking ordered
> > data mode buffer - you cannot just do: get the reference and do
> > lock_buffer() because that violates your requirements. The only reasonable
> > way you could do that is to lock the page to make sure buffer won't go away
> > from you - but you cannot currently do that in journal commit code because
> > of lock ordering. So the only way I can see which is left is: get some jbd
> > spin lock to serialize with journal_try_to_free_buffers(), get the buffer
> > reference, try to lock buffer, if it fails, drop everything and restart.
> > And this is IMO no-go...
> > And BTW even if you fix such races, I think you'll still have races like:
> > CPU1:                                      CPU2:
> > filemap_write_and_wait()
> >                                            dirty a page
> >                                            msync() (dirties buffers)
> > invalidate_inode_pages2_range() -> -EIO
> >
>
> I can see this is possible with mapped IO. But for buffered IO, since
> direct IO is holding i_mutex, this case should not happen, right?
Yes.
> > > @@ -209,19 +213,24 @@ write_out_data:
> > > if (buffer_dirty(bh)) {
> > > if (test_set_buffer_locked(bh)) {
> > > BUFFER_TRACE(bh, "needs blocking lock");
> > > + put_bh(bh);
> > > spin_unlock(&journal->j_list_lock);
> > > /* Write out all data to prevent deadlocks */
> > > journal_do_submit_data(wbuf, bufs);
> > > bufs = 0;
> > > - lock_buffer(bh);
> > > spin_lock(&journal->j_list_lock);
> > > + continue;
> > ^^^ Here you can see what I wrote above. Basically you just busy-loop
> > waiting for the buffer lock. You should at least put a schedule()
> > there so that you don't lock up the CPU, but it's ugly anyway.
> >
> Yup.
>
> The conflict is that if we still hold the bh reference after releasing
> the j_list_lock, journal_try_to_free_buffers() can come in and return
> EIO to direct IO since the buffer is busy; but if we drop the bh
> reference when we release the j_list_lock,
> journal_try_to_free_buffers() may free that buffer, so we can no longer
> do lock_buffer() here and have to loop instead. This is a trade-off...
>
> On the other hand, journal_submit_data_buffers() continues processing
> this buffer after re-grabbing the j_list_lock even if it has been
> removed from the t_syncdata_list by __journal_try_to_free_buffers().
> IMO this is not optimal.
Well, you are going to hit this path for example when the buffer is
under IO right now. So it happens quite often, e.g. when the machine is
under memory pressure and writes out dirty data more aggressively.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon 12-05-08 17:39:43, Mingming Cao wrote:
> On Mon, 2008-05-12 at 17:54 +0200, Jan Kara wrote:
> Does this match what you are thinking? It certainly slows down the DIO
> path, but on the positive side it doesn't disturb the other code
> paths... thanks for your feedback!
>
> --------------------------------------------
>
> An unexpected EIO error gets returned when writing to a file
> using buffered writes and DIO writes at the same time.
>
> We found a number of places where journal_try_to_free_buffers() can
> race with journal_commit_transaction(): the latter still holds the
> reference to the buffers on the t_syncdata_list or t_locked_list while
> journal_try_to_free_buffers() tries to free them, which results in an
> EIO error being returned to the dio caller.
>
> The fix is to retry the freeing when journal_try_to_free_buffers()
> fails to free data buffers that journal_commit_transaction() still
> references.
> This is done by implementing an ext3 launder_page() callback, instead
> of changing journal_try_to_free_buffers() itself, so that other code
> paths calling journal_try_to_free_buffers() are unaffected and only the
> dio path is.
>
> Signed-off-by: Mingming Cao <[email protected]>
> Index: linux-2.6.26-rc1/fs/ext3/inode.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/ext3/inode.c 2008-05-03 11:59:44.000000000 -0700
> +++ linux-2.6.26-rc1/fs/ext3/inode.c 2008-05-12 12:41:27.000000000 -0700
> @@ -1766,6 +1766,23 @@ static int ext3_journalled_set_page_dirt
> return __set_page_dirty_nobuffers(page);
> }
>
> +static int ext3_launder_page(struct page *page)
> +{
> + int ret;
> + int retry = 5;
> +
> + while (retry --) {
> + ret = ext3_releasepage(page, GFP_KERNEL);
> + if (ret == 1)
> + break;
> + else
> + schedule();
> + }
> +
> + return ret;
> +}
> +
> +
Yes, I meant something like this. We could be more clever and do:
head = bh = page_buffers(page);
do {
wait_on_buffer(bh);
bh = bh->b_this_page;
} while (bh != head);
/*
* Now commit code should have been able to proceed and release
* those buffers
*/
schedule();
or we could simply do:
log_wait_commit(...);
That would impose a larger performance penalty, but on the other hand
you shouldn't hit this path too often. But maybe the code above would be
fine and would handle most cases. Also please add a big comment to that
function to explain why this magic is needed.
> static const struct address_space_operations ext3_ordered_aops = {
> .readpage = ext3_readpage,
> .readpages = ext3_readpages,
> @@ -1778,6 +1795,7 @@ static const struct address_space_operat
> .releasepage = ext3_releasepage,
> .direct_IO = ext3_direct_IO,
> .migratepage = buffer_migrate_page,
> + .launder_page = ext3_launder_page,
> };
>
> static const struct address_space_operations ext3_writeback_aops = {
> @@ -1792,6 +1810,7 @@ static const struct address_space_operat
> .releasepage = ext3_releasepage,
> .direct_IO = ext3_direct_IO,
> .migratepage = buffer_migrate_page,
> + .launder_page = ext3_launder_page,
> };
>
> static const struct address_space_operations ext3_journalled_aops = {
> @@ -1805,6 +1824,7 @@ static const struct address_space_operat
> .bmap = ext3_bmap,
> .invalidatepage = ext3_invalidatepage,
> .releasepage = ext3_releasepage,
> + .launder_page = ext3_launder_page,
> };
>
> void ext3_set_aops(struct inode *inode)
Actually, we need the .launder_page callback only in data=ordered mode.
data=writeback mode doesn't need it at all (the journal code doesn't
touch data buffers there), and for data=journal mode DIO has never
worked reasonably when mixed with buffered IO; it would have to do
different and much more expensive trickery (like flushing the journal,
or at least forcing the current transaction to commit).
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Tue, 2008-05-13 at 16:54 +0200, Jan Kara wrote:
> On Mon 12-05-08 17:39:43, Mingming Cao wrote:
> > Index: linux-2.6.26-rc1/fs/ext3/inode.c
> > ===================================================================
> > --- linux-2.6.26-rc1.orig/fs/ext3/inode.c 2008-05-03 11:59:44.000000000 -0700
> > +++ linux-2.6.26-rc1/fs/ext3/inode.c 2008-05-12 12:41:27.000000000 -0700
> > @@ -1766,6 +1766,23 @@ static int ext3_journalled_set_page_dirt
> > return __set_page_dirty_nobuffers(page);
> > }
> >
> > +static int ext3_launder_page(struct page *page)
> > +{
> > + int ret;
> > + int retry = 5;
> > +
> > + while (retry --) {
> > + ret = ext3_releasepage(page, GFP_KERNEL);
> > + if (ret == 1)
> > + break;
> > + else
> > + schedule();
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +
> Yes, I meant something like this. We could be more clever and do:
>
> head = bh = page_buffers(page);
> do {
> wait_on_buffer(bh);
> bh = bh->b_this_page;
> } while (bh != head);
> /*
> * Now commit code should have been able to proceed and release
> * those buffers
> */
> schedule();
>
Thanks.
We could recheck buffer_busy() before calling wait_on_buffer(bh) to wait
for the buffer to be unlocked. That handles the mapped IO re-dirty race,
but we still need the schedule() and retry to handle the buffered IO
race.
>
> or we could simply do:
> log_wait_commit(...);
>
> That would impose a larger performance penalty, but on the other hand
> you shouldn't hit this path too often.
My concern with doing log_wait_commit() here is the performance penalty.
In case the buffers are at the end of the commit queue, we have to wait
for all previous transactions to finish committing before we can
continue...
> But maybe the code above would be fine and would
> handle most cases. Also please add a big comment to that function to explain
> why this magic is needed.
>
Will do.
> > static const struct address_space_operations ext3_ordered_aops = {
> > .readpage = ext3_readpage,
> > .readpages = ext3_readpages,
> > @@ -1778,6 +1795,7 @@ static const struct address_space_operat
> > .releasepage = ext3_releasepage,
> > .direct_IO = ext3_direct_IO,
> > .migratepage = buffer_migrate_page,
> > + .launder_page = ext3_launder_page,
> > };
> >
> > static const struct address_space_operations ext3_writeback_aops = {
> > @@ -1792,6 +1810,7 @@ static const struct address_space_operat
> > .releasepage = ext3_releasepage,
> > .direct_IO = ext3_direct_IO,
> > .migratepage = buffer_migrate_page,
> > + .launder_page = ext3_launder_page,
> > };
> >
> > static const struct address_space_operations ext3_journalled_aops = {
> > @@ -1805,6 +1824,7 @@ static const struct address_space_operat
> > .bmap = ext3_bmap,
> > .invalidatepage = ext3_invalidatepage,
> > .releasepage = ext3_releasepage,
> > + .launder_page = ext3_launder_page,
> > };
> >
> > void ext3_set_aops(struct inode *inode)
> Actually, we need the .launder_page callback only in data=ordered mode.
> data=writeback mode doesn't need it at all (the journal code doesn't
> touch data buffers there), and for data=journal mode DIO has never
> worked reasonably when mixed with buffered IO; it would have to do
> different and much more expensive trickery (like flushing the journal,
> or at least forcing the current transaction to commit).
>
You are right, thanks for pointing this out.
Will post an updated patch.
Mingming
On Tue, 2008-05-13 at 16:54 +0200, Jan Kara wrote:
> On Mon 12-05-08 17:39:43, Mingming Cao wrote:
> > On Mon, 2008-05-12 at 17:54 +0200, Jan Kara wrote:
> > Does this match what you are thinking? It certainly slows down the
> > DIO path, but on the positive side it doesn't disturb the other code
> > paths... thanks for your feedback!
> >
> > --------------------------------------------
> >
> > An unexpected EIO error gets returned when writing to a file
> > using buffered writes and DIO writes at the same time.
> >
> > We found a number of places where journal_try_to_free_buffers() can
> > race with journal_commit_transaction(): the latter still holds the
> > reference to the buffers on the t_syncdata_list or t_locked_list
> > while journal_try_to_free_buffers() tries to free them, which results
> > in an EIO error being returned to the dio caller.
> >
> > The fix is to retry the freeing when journal_try_to_free_buffers()
> > fails to free data buffers that journal_commit_transaction() still
> > references.
> > This is done by implementing an ext3 launder_page() callback, instead
> > of changing journal_try_to_free_buffers() itself, so that other code
> > paths calling journal_try_to_free_buffers() are unaffected and only
> > the dio path is.
> >
> > Signed-off-by: Mingming Cao <[email protected]>
> > Index: linux-2.6.26-rc1/fs/ext3/inode.c
> > ===================================================================
> > --- linux-2.6.26-rc1.orig/fs/ext3/inode.c 2008-05-03 11:59:44.000000000 -0700
> > +++ linux-2.6.26-rc1/fs/ext3/inode.c 2008-05-12 12:41:27.000000000 -0700
> > @@ -1766,6 +1766,23 @@ static int ext3_journalled_set_page_dirt
> > return __set_page_dirty_nobuffers(page);
> > }
> >
> > +static int ext3_launder_page(struct page *page)
> > +{
> > + int ret;
> > + int retry = 5;
> > +
> > + while (retry --) {
> > + ret = ext3_releasepage(page, GFP_KERNEL);
> > + if (ret == 1)
> > + break;
> > + else
> > + schedule();
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +
> Yes, I meant something like this. We could be more clever and do:
>
> head = bh = page_buffers(page);
> do {
> wait_on_buffer(bh);
> bh = bh->b_this_page;
> } while (bh != head);
> /*
> * Now commit code should have been able to proceed and release
> * those buffers
> */
> schedule();
>
Bummer, we can't free buffers in ext3_launder_page() before calling
try_to_free_page, as the subsequent
invalidate_complete_page2()->try_to_free_page() expects the page's
buffers to still be there, and will return EIO if launder_page() has
already freed those buffers. :(
Doing wait_on_buffer() alone in launder_page() is not enough, as it
doesn't wait for the buffer's reference count to drop to 0.
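In code terms, the gap is roughly the following (a sketch assuming
2.6.26-era buffer_head semantics, where drop_buffers() treats a buffer
as busy on any of these conditions):

	wait_on_buffer(bh);              /* waits only for the BH_Lock bit */
	if (atomic_read(&bh->b_count) || /* an elevated refcount, */
	    buffer_dirty(bh) ||          /* a redirtied buffer, */
	    buffer_locked(bh))           /* or a re-taken lock */
		return 0;  /* buffer still "busy": try_to_free_buffers() fails */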
> or we could simply do:
> log_wait_commit(...);
>
> That would impose a larger performance penalty, but on the other hand
> you shouldn't hit this path too often.
Guess this last option is worth a try to see how bad it is; here is a
draft patch. Do you see any obvious problems?
Mingming
Signed-off-by: Mingming Cao <[email protected]>
fs/ext3/inode.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 68 insertions(+)
Index: linux-2.6.26-rc1/fs/ext3/inode.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/ext3/inode.c 2008-05-13 13:35:27.000000000 -0700
+++ linux-2.6.26-rc1/fs/ext3/inode.c 2008-05-13 14:33:55.000000000 -0700
@@ -1766,6 +1766,53 @@ static int ext3_journalled_set_page_dirt
return __set_page_dirty_nobuffers(page);
}
+/*
+ * There are a number of places where journal_try_to_free_buffers()
+ * can race with journal_commit_transaction(): the latter still
+ * holds the reference to the buffers on t_syncdata_list or t_locked_list
+ * while journal_try_to_free_buffers() tries to free them,
+ * which results in an EIO error returned to generic_file_direct_IO().
+ *
+ * It is also possible that mapped IO re-dirties the page and buffers
+ * after direct IO has waited for dirty pages to be flushed to disk.
+ * When journal_try_to_free_buffers() is called from the direct IO
+ * path, it may fail to free the buffers as the buffer may be locked again.
+ *
+ * To fix this problem, we add an ext3_launder_page() callback which
+ * is only called on the direct IO path, to wait for the buffer to be
+ * unlocked and for jbd to finish committing those data buffers.
+ *
+ * This will impact direct IO performance, but it allows direct IO and
+ * buffered IO to be performed concurrently.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+static int ext3_launder_page(struct page *page)
+{
+ int ret = 0;
+ struct buffer_head *head;
+ struct buffer_head *bh;
+ journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+
+ head = bh = page_buffers(page);
+ do {
+ /*
+ * it's possible mapped IO re-dirty and locked the
+ * buffer again when we came here
+ */
+ if (buffer_locked(bh)) {
+ get_bh(bh);
+ wait_on_buffer(bh);
+ put_bh(bh);
+ }
+
+ journal_wait_commit_buffer(journal, bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
+
+ return ret;
+}
+
static const struct address_space_operations ext3_ordered_aops = {
.readpage = ext3_readpage,
.readpages = ext3_readpages,
@@ -1778,6 +1825,7 @@ static const struct address_space_operat
.releasepage = ext3_releasepage,
.direct_IO = ext3_direct_IO,
.migratepage = buffer_migrate_page,
+ .launder_page = ext3_launder_page,
};
static const struct address_space_operations ext3_writeback_aops = {
Index: linux-2.6.26-rc1/fs/jbd/journal.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/jbd/journal.c 2008-05-03 11:59:44.000000000 -0700
+++ linux-2.6.26-rc1/fs/jbd/journal.c 2008-05-13 14:43:47.000000000 -0700
@@ -74,6 +74,7 @@ EXPORT_SYMBOL(journal_errno);
EXPORT_SYMBOL(journal_ack_err);
EXPORT_SYMBOL(journal_clear_err);
EXPORT_SYMBOL(log_wait_commit);
+EXPORT_SYMBOL(journal_wait_commit_buffer);
EXPORT_SYMBOL(journal_start_commit);
EXPORT_SYMBOL(journal_force_commit_nested);
EXPORT_SYMBOL(journal_wipe);
@@ -558,6 +559,46 @@ int log_wait_commit(journal_t *journal,
}
/*
+ * Wait until a specific buffer has been committed to the log
+ */
+int journal_wait_commit_buffer(journal_t *journal, struct buffer_head *bh)
+{
+ int ret = 0;
+ struct journal_head * jh;
+ transaction_t *t;
+ tid_t tid;
+
+ jbd_lock_bh_state(bh);
+ spin_lock(&journal->j_list_lock);
+ jh = journal_grab_journal_head(bh);
+
+ if (!jh)
+ goto unlock;
+
+ t = jh->b_transaction;
+
+ if (!t)
+ goto release;
+
+ tid = t->t_tid;
+
+ journal_put_journal_head(jh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+
+ log_start_commit(journal, tid);
+ ret = log_wait_commit(journal, tid);
+ return ret;
+
+release:
+ journal_put_journal_head(jh);
+unlock:
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ return ret;
+}
+
+/*
* Log buffer allocation routines:
*/
Index: linux-2.6.26-rc1/include/linux/jbd.h
===================================================================
--- linux-2.6.26-rc1.orig/include/linux/jbd.h 2008-05-03 11:59:44.000000000 -0700
+++ linux-2.6.26-rc1/include/linux/jbd.h 2008-05-13 14:42:45.000000000 -0700
@@ -974,6 +974,7 @@ int __log_start_commit(journal_t *journa
int journal_start_commit(journal_t *journal, tid_t *tid);
int journal_force_commit_nested(journal_t *journal);
int log_wait_commit(journal_t *journal, tid_t tid);
+int journal_wait_commit_buffer(journal_t *journal, struct buffer_head *bh);
int log_do_checkpoint(journal_t *journal);
void __log_wait_for_space(journal_t *journal);
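For context, the VM invokes the new ->launder_page() callback only through
do_launder_page() in mm/truncate.c. A minimal sketch of that 2.6.26-era
helper, reconstructed from memory (treat the details as approximate):

	static int do_launder_page(struct address_space *mapping,
				   struct page *page)
	{
		/* only dirty pages get laundered -- this matters below */
		if (!PageDirty(page))
			return 0;
		if (page->mapping != mapping ||
		    mapping->a_ops->launder_page == NULL)
			return 0;
		return mapping->a_ops->launder_page(page);
	}

Note the PageDirty() check at the top: a clean page never reaches
->launder_page() at all, which is exactly what the thread discovers below.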
On Tue 13-05-08 15:23:09, Mingming Cao wrote:
> On Tue, 2008-05-13 at 16:54 +0200, Jan Kara wrote:
> > On Mon 12-05-08 17:39:43, Mingming Cao wrote:
> > > On Mon, 2008-05-12 at 17:54 +0200, Jan Kara wrote:
> > > Does this match what you are thinking? It certainly slows down the DIO
> > > path, but on the positive side it doesn't disturb the other code paths...
> > > thanks for your feedback!
> > >
> > > --------------------------------------------
> > >
> > > An unexpected EIO error gets returned when writing to a file
> > > using buffered writes and DIO writes at the same time.
> > >
> > > We found there are a number of places where journal_try_to_free_buffers()
> > > could race with journal_commit_transaction(); the latter still
> > > holds a reference to the buffers on the t_syncdata_list or t_locked_list
> > > while journal_try_to_free_buffers() tries to free them, which results in an
> > > EIO error being returned to the dio caller.
> > >
> > > The logical fix is to retry the freeing if journal_try_to_free_buffers() failed
> > > to free those data buffers while journal_commit_transaction() still
> > > references those buffers.
> > > This is done by implementing an ext3 launder_page() callback, instead of inside
> > > journal_try_to_free_buffers() itself, so that it doesn't affect other code
> > > paths calling journal_try_to_free_buffers() and only the dio path is affected.
> > >
> > > Signed-off-by: Mingming Cao <[email protected]>
> > > Index: linux-2.6.26-rc1/fs/ext3/inode.c
> > > ===================================================================
> > > --- linux-2.6.26-rc1.orig/fs/ext3/inode.c 2008-05-03 11:59:44.000000000 -0700
> > > +++ linux-2.6.26-rc1/fs/ext3/inode.c 2008-05-12 12:41:27.000000000 -0700
> > > @@ -1766,6 +1766,23 @@ static int ext3_journalled_set_page_dirt
> > > return __set_page_dirty_nobuffers(page);
> > > }
> > >
> > > +static int ext3_launder_page(struct page *page)
> > > +{
> > > + int ret;
> > > + int retry = 5;
> > > +
> > > + while (retry --) {
> > > + ret = ext3_releasepage(page, GFP_KERNEL);
> > > + if (ret == 1)
> > > + break;
> > > + else
> > > + schedule();
> > > + }
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +
> > Yes, I meant something like this. We could be more clever and do:
> >
> > head = bh = page_buffers(page);
> > do {
> > wait_on_buffer(bh);
> > bh = bh->b_this_page;
> > } while (bh != head);
> > /*
> > * Now commit code should have been able to proceed and release
> > * those buffers
> > */
> > schedule();
> >
>
> Bummer, we can't free buffers in ext3_launder_page() before calling
> try_to_free_page, as the subsequent
> invalidate_complete_page2()->try_to_free_page() expects the page
> buffers to still be there, and will return EIO if launder_page() has
> already freed those buffers. :(
Are you sure? Because if buffers are released in ext3_launder_page(),
PagePrivate() has been set to 0 and we should directly fall through to
releasing the page without ever calling try_to_release_page()... So I'd
want to find out why PagePrivate is still set in
invalidate_complete_page2().
> Doing wait_on_buffer() alone in launder_page() is not enough, as it
> doesn't wait for the buffer reference count to drop to 0.
Yes, this would not be enough.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
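To see why waiting on the buffer lock alone cannot be sufficient:
try_to_free_buffers() refuses to touch any "busy" buffer, and busy includes
an elevated b_count reference, not just the lock and dirty bits. The
fs/buffer.c helper looks roughly like this (quoted from memory):

	static inline int buffer_busy(struct buffer_head *bh)
	{
		/* busy if referenced, dirty, or locked */
		return atomic_read(&bh->b_count) |
			(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
	}

The commit code's get_bh() keeps b_count elevated until the transaction is
done with the buffer, which is the window being discussed here.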
On Wed, 2008-05-14 at 19:08 +0200, Jan Kara wrote:
> On Tue 13-05-08 15:23:09, Mingming Cao wrote:
> > On Tue, 2008-05-13 at 16:54 +0200, Jan Kara wrote:
> > > On Mon 12-05-08 17:39:43, Mingming Cao wrote:
> > > > On Mon, 2008-05-12 at 17:54 +0200, Jan Kara wrote:
> > > > Does this match what you are thinking? It certainly slows down the DIO
> > > > path, but on the positive side it doesn't disturb the other code paths...
> > > > thanks for your feedback!
> > > >
> > > > --------------------------------------------
> > > >
> > > > An unexpected EIO error gets returned when writing to a file
> > > > using buffered writes and DIO writes at the same time.
> > > >
> > > > We found there are a number of places where journal_try_to_free_buffers()
> > > > could race with journal_commit_transaction(); the latter still
> > > > holds a reference to the buffers on the t_syncdata_list or t_locked_list
> > > > while journal_try_to_free_buffers() tries to free them, which results in an
> > > > EIO error being returned to the dio caller.
> > > >
> > > > The logical fix is to retry the freeing if journal_try_to_free_buffers() failed
> > > > to free those data buffers while journal_commit_transaction() still
> > > > references those buffers.
> > > > This is done by implementing an ext3 launder_page() callback, instead of inside
> > > > journal_try_to_free_buffers() itself, so that it doesn't affect other code
> > > > paths calling journal_try_to_free_buffers() and only the dio path is affected.
> > > >
> > > > Signed-off-by: Mingming Cao <[email protected]>
> > > > Index: linux-2.6.26-rc1/fs/ext3/inode.c
> > > > ===================================================================
> > > > --- linux-2.6.26-rc1.orig/fs/ext3/inode.c 2008-05-03 11:59:44.000000000 -0700
> > > > +++ linux-2.6.26-rc1/fs/ext3/inode.c 2008-05-12 12:41:27.000000000 -0700
> > > > @@ -1766,6 +1766,23 @@ static int ext3_journalled_set_page_dirt
> > > > return __set_page_dirty_nobuffers(page);
> > > > }
> > > >
> > > > +static int ext3_launder_page(struct page *page)
> > > > +{
> > > > + int ret;
> > > > + int retry = 5;
> > > > +
> > > > + while (retry --) {
> > > > + ret = ext3_releasepage(page, GFP_KERNEL);
> > > > + if (ret == 1)
> > > > + break;
> > > > + else
> > > > + schedule();
> > > > + }
> > > > +
> > > > + return ret;
> > > > +}
> > > > +
> > > > +
> > > Yes, I meant something like this. We could be more clever and do:
> > >
> > > head = bh = page_buffers(page);
> > > do {
> > > wait_on_buffer(bh);
> > > bh = bh->b_this_page;
> > > } while (bh != head);
> > > /*
> > > * Now commit code should have been able to proceed and release
> > > * those buffers
> > > */
> > > schedule();
> > >
> >
> > Bummer, we can't free buffers in ext3_launder_page() before calling
> > try_to_free_page, as the subsequent
> > invalidate_complete_page2()->try_to_free_page() expects the page
> > buffers to still be there, and will return EIO if launder_page() has
> > already freed those buffers. :(
> Are you sure? Because if buffers are released in ext3_launder_page(),
> PagePrivate() has been set to 0 and we should directly fall through to
> releasing the page without ever calling try_to_release_page()... So I'd
> want to find out why PagePrivate is still set in
> invalidate_complete_page2().
>
You are right. PagePrivate() is being set to 0 in drop_buffers().
The problem is that do_launder_page() returns successfully if the page is
not dirty (our case), so ext3_launder_page() never even gets called. This
also explains why the log_wait_commit() approach doesn't work for me :(
Have to think of other ways... could we pass some flag to
journal_try_to_free_buffers(), and ask journal_try_to_free_buffers() to
wait for the jbd commit to finish flushing the data, if the request is
from direct IO?
Mingming
On Wed 14-05-08 10:41:12, Mingming Cao wrote:
> > > Bummer, we can't free buffers in ext3_launder_page() before calling
> > > try_to_free_page, as the subsequent
> > > invalidate_complete_page2()->try_to_free_page() expects the page
> > > buffers to still be there, and will return EIO if launder_page() has
> > > already freed those buffers. :(
> > Are you sure? Because if buffers are released in ext3_launder_page(),
> > PagePrivate() has been set to 0 and we should directly fall through to
> > releasing the page without ever calling try_to_release_page()... So I'd
> > want to find out why PagePrivate is still set in
> > invalidate_complete_page2().
> >
>
> You are right. PagePrivate() is being set to 0 in drop_buffers().
>
> The problem is that do_launder_page() returns successfully if the page is
> not dirty (our case), so ext3_launder_page() never even gets called. This
> also explains why the log_wait_commit() approach doesn't work for me :(
I didn't realize PageDirty() would already be cleared by the previous
writes... :(
> Have to think of other ways... could we pass some flag to
> journal_try_to_free_buffers(), and ask journal_try_to_free_buffers() to
> wait for the jbd commit to finish flushing the data, if the request is
> from direct IO?
Well, we could do that, but we'd have to change the try_to_release_page()
call to accept an extra argument, which would consequently mean changing all
the filesystems... But yes, it probably makes sense because it really is
different whether we should just release the page because of memory
pressure or because direct IO needs to write to that area of the file.
So adding the parameter to the releasepage() callback is probably a
reasonable thing to do.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
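One hypothetical shape for such an interface change, purely for
illustration -- the thread below ends up reusing gfp_mask instead, so none
of these names exist in the kernel:

	/* hypothetical: make the release reason explicit */
	enum release_reason {
		RELEASE_MEM_PRESSURE,	/* reclaim just wants memory back */
		RELEASE_INVALIDATE,	/* direct IO must invalidate range */
	};

	int try_to_release_page(struct page *page, gfp_t gfp_mask,
				enum release_reason reason);

Every filesystem's ->releasepage() would have to grow the same extra
argument, which is the churn being weighed against the gfp_mask trick.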
On Wed, 2008-05-14 at 20:14 +0200, Jan Kara wrote:
> On Wed 14-05-08 10:41:12, Mingming Cao wrote:
> > > > Bummer, we can't free buffers in ext3_launder_page() before calling
> > > > try_to_free_page, as the subsequent
> > > > invalidate_complete_page2()->try_to_free_page() expects the page
> > > > buffers to still be there, and will return EIO if launder_page() has
> > > > already freed those buffers. :(
> > > Are you sure? Because if buffers are released in ext3_launder_page(),
> > > PagePrivate() has been set to 0 and we should directly fall through to
> > > releasing the page without ever calling try_to_release_page()... So I'd
> > > want to find out why PagePrivate is still set in
> > > invalidate_complete_page2().
> > >
> >
> > You are right. PagePrivate() is being set to 0 in drop_buffers().
> >
> > The problem is that do_launder_page() returns successfully if the page is
> > not dirty (our case), so ext3_launder_page() never even gets called. This
> > also explains why the log_wait_commit() approach doesn't work for me :(
> I didn't realize PageDirty() would already be cleared by the previous
> writes... :(
>
> > Have to think of other ways... could we pass some flag to
> > journal_try_to_free_buffers(), and ask journal_try_to_free_buffers() to
> > wait for the jbd commit to finish flushing the data, if the request is
> > from direct IO?
> Well, we could do that, but we'd have to change the try_to_release_page()
> call to accept an extra argument, which would consequently mean changing all
> the filesystems...
Actually, there is a gfp_mask argument passed to try_to_release_page()
through which we could pass a special flag from direct IO that could be
interpreted as a direct IO request. This would avoid changing all the
filesystems and the address space operations interface. In fact, I don't
see any in-kernel-tree fs releasepage() callback using this gfp_mask,
except that btrfs is using it.
> But yes, it probably makes sense because it really is
> different whether we should just release the page because of memory
> pressure or because direct IO needs to write to that area of the file.
> So adding the parameter to the releasepage() callback is probably a
> reasonable thing to do.
>
Will send a patch shortly; with that patch, the test has run fine for about
18 hours.
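For reference, the EIO propagation path being fixed, abridged from 2.6.26
mm/truncate.c as best I recall it:

	/* invalidate_inode_pages2_range(), for each page in the range */
	ret2 = do_launder_page(mapping, page);
	if (ret2 == 0) {
		if (!invalidate_complete_page2(mapping, page))
			ret2 = -EIO;	/* buffers could not be freed */
	}
	if (ret2 < 0)
		ret = ret2;

The direct IO write path calls invalidate_inode_pages2_range() and hands
that -EIO straight back to the caller.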
This patch fixed a few races between direct IO and kjournld commit transaction.
An unexpected EIO error gets returned to direct IO caller when it failed to
free those data buffers. This could be reproduced easily with parallel
direct write and buffered write to the same file
More specific, those races could cause journal_try_to_free_buffers()
fail to free the data buffers, when jbd is committing the transaction that has
those data buffers on its t_syncdata_list or t_locked_list.
journal_commit_transaction() still holds the reference to those buffers
before data reach to disk and buffers are removed from the
t_syncdata_list of t_locked_list. This prevent the concurrent
journal_try_to_free_buffers() to free those buffers at the same time, and cause
EIO error returns back to direct IO.
With this patch, in case of direct IO and when try_to_free_buffers() failed,
let's waiting for journal_commit_transaction() to finish
flushing the current committing transaction's data buffers to disk,
then try to free those buffers again.
Signed-off-by: Mingming Cao <[email protected]>
Reviewed-by: Badari Pulavarty <[email protected]>
---
fs/jbd/commit.c | 1 +
fs/jbd/journal.c | 1 +
fs/jbd/transaction.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
include/linux/jbd.h | 3 +++
4 files changed, 51 insertions(+)
Index: linux-2.6.26-rc1/include/linux/jbd.h
===================================================================
--- linux-2.6.26-rc1.orig/include/linux/jbd.h 2008-05-14 16:36:41.000000000 -0700
+++ linux-2.6.26-rc1/include/linux/jbd.h 2008-05-15 14:12:10.000000000 -0700
@@ -667,6 +667,9 @@ struct journal_s
*/
wait_queue_head_t j_wait_transaction_locked;
+ /* Wait queu for waiting for data buffers to flushed to disk*/
+ wait_queue_head_t j_wait_data_flushed;
+
/* Wait queue for waiting for checkpointing to complete */
wait_queue_head_t j_wait_logspace;
Index: linux-2.6.26-rc1/fs/jbd/commit.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/jbd/commit.c 2008-05-03 11:59:44.000000000 -0700
+++ linux-2.6.26-rc1/fs/jbd/commit.c 2008-05-15 14:12:46.000000000 -0700
@@ -462,6 +462,7 @@ void journal_commit_transaction(journal_
* clean by now, so check that it is in fact empty.
*/
J_ASSERT (commit_transaction->t_sync_datalist == NULL);
+ wake_up(&journal->j_wait_data_flushed)
jbd_debug (3, "JBD: commit phase 3\n");
Index: linux-2.6.26-rc1/fs/jbd/journal.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/jbd/journal.c 2008-05-14 16:36:41.000000000 -0700
+++ linux-2.6.26-rc1/fs/jbd/journal.c 2008-05-15 14:13:02.000000000 -0700
@@ -660,6 +660,7 @@ static journal_t * journal_init_common (
goto fail;
init_waitqueue_head(&journal->j_wait_transaction_locked);
+ init_waitqueue_head(&journal->j_wait_data_flushed);
init_waitqueue_head(&journal->j_wait_logspace);
init_waitqueue_head(&journal->j_wait_done_commit);
init_waitqueue_head(&journal->j_wait_checkpoint);
Index: linux-2.6.26-rc1/fs/jbd/transaction.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/jbd/transaction.c 2008-05-03 11:59:44.000000000 -0700
+++ linux-2.6.26-rc1/fs/jbd/transaction.c 2008-05-16 06:42:30.000000000 -0700
@@ -1648,12 +1648,47 @@ out:
return;
}
+/*
+ * Journal_try_to_free_buffers() will call this function to w
+ * waiting for the current transaction finishing syncing data buffers, before
+ * try to free that buffer.
+ *
+ * journal_try_to_free_buffers() could race with journal_commit_transaction()
+ * The later might still hold the reference count to the buffers when inspecting
+ * them on t_syncdata_list or t_locked_list.
+ */
+static void journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction)
+ return;
+
+ /*
+ * If the current transaction is flushing and waiting for data buffers
+ * (t_state is T_FLUSH), wait for the j_wait_data_flushed event
+ */
+ if (transaction->t_state == T_FLUSH) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&journal->j_wait_data_flushed,
+ &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&journal->j_state_lock);
+ schedule();
+ finish_wait(&journal->j_wait_data_flushed, &wait);
+ spin_lock(&journal->j_state_lock);
+ }
+ return;
+}
/**
* int journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: unused for allocation purpose. Here is used
+ * as a flag to tell if direct IO is attemping to free buffers.
*
*
* For all the buffers on this page,
@@ -1682,13 +1717,16 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
int ret = 0;
+ int dio = gfp_mask & __GFP_REPEAT;
J_ASSERT(PageLocked(page));
@@ -1713,7 +1751,31 @@ int journal_try_to_free_buffers(journal_
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * In the case of concurrent direct IO and buffered IO,
+ * There are a number of places where we
+ * could race with journal_commit_transaction(), the later still
+ * helds the reference to the buffers to free while processing them.
+ * try_to_free_buffers() failed to free those buffers,
+ * resulting in an unexpected EIO error
+ * returns back to the generic_file_direct_IO()
+ *
+ * So let's wait for the current transaction finished flush
+ * dirty data buffers before we try to free those buffers
+ * again. This wait is needed by direct IO code path only,
+ * gfp_mask __GFP_REPEAT is passed from the direct IO code
+ * path to flag if we need to wait and retry free buffers.
+ */
+ if (ret == 0 && dio) {
+ spin_lock(&journal->j_state_lock);
+ journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ spin_unlock(&journal->j_state_lock);
+ }
+
busy:
return ret;
}
Index: linux-2.6.26-rc1/mm/truncate.c
===================================================================
--- linux-2.6.26-rc1.orig/mm/truncate.c 2008-05-03 11:59:44.000000000 -0700
+++ linux-2.6.26-rc1/mm/truncate.c 2008-05-15 13:13:21.000000000 -0700
@@ -346,7 +346,8 @@ invalidate_complete_page2(struct address
if (page->mapping != mapping)
return 0;
- if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+ if (PagePrivate(page) &&
+ !try_to_release_page(page,GFP_KERNEL|__GFP_REPEAT))
return 0;
write_lock_irq(&mapping->tree_lock);
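The __GFP_REPEAT flag reaches jbd because ext3's releasepage simply
forwards the gfp mask it is given; a sketch of the 2.6.26 code, from
memory:

	static int ext3_releasepage(struct page *page, gfp_t wait)
	{
		journal_t *journal = EXT3_JOURNAL(page->mapping->host);

		WARN_ON(PageChecked(page));
		if (!page_has_buffers(page))
			return 0;
		/* the gfp mask is passed through unmodified */
		return journal_try_to_free_buffers(journal, page, wait);
	}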
On Fri, May 16, 2008 at 07:14:10AM -0700, Mingming Cao wrote:
> This patch fixed a few races between direct IO and kjournld commit transaction.
> An unexpected EIO error gets returned to direct IO caller when it failed to
> free those data buffers. This could be reproduced easily with parallel
> direct write and buffered write to the same file
>
> More specific, those races could cause journal_try_to_free_buffers()
> fail to free the data buffers, when jbd is committing the transaction that has
> those data buffers on its t_syncdata_list or t_locked_list.
> journal_commit_transaction() still holds the reference to those buffers
> before data reach to disk and buffers are removed from the
> t_syncdata_list of t_locked_list. This prevent the concurrent
> journal_try_to_free_buffers() to free those buffers at the same time, and cause
> EIO error returns back to direct IO.
>
> With this patch, in case of direct IO and when try_to_free_buffers() failed,
> let's waiting for journal_commit_transaction() to finish
> flushing the current committing transaction's data buffers to disk,
> then try to free those buffers again.
>
> Signed-off-by: Mingming Cao <[email protected]>
> Reviewed-by: Badari Pulavarty <[email protected]>
> ---
> fs/jbd/commit.c | 1 +
> fs/jbd/journal.c | 1 +
> fs/jbd/transaction.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/jbd.h | 3 +++
> 4 files changed, 51 insertions(+)
>
> Index: linux-2.6.26-rc1/include/linux/jbd.h
> ===================================================================
> --- linux-2.6.26-rc1.orig/include/linux/jbd.h 2008-05-14 16:36:41.000000000 -0700
> +++ linux-2.6.26-rc1/include/linux/jbd.h 2008-05-15 14:12:10.000000000 -0700
> @@ -667,6 +667,9 @@ struct journal_s
> */
> wait_queue_head_t j_wait_transaction_locked;
>
> + /* Wait queu for waiting for data buffers to flushed to disk*/
> + wait_queue_head_t j_wait_data_flushed;
> +
> /* Wait queue for waiting for checkpointing to complete */
> wait_queue_head_t j_wait_logspace;
>
> Index: linux-2.6.26-rc1/fs/jbd/commit.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/jbd/commit.c 2008-05-03 11:59:44.000000000 -0700
> +++ linux-2.6.26-rc1/fs/jbd/commit.c 2008-05-15 14:12:46.000000000 -0700
> @@ -462,6 +462,7 @@ void journal_commit_transaction(journal_
> * clean by now, so check that it is in fact empty.
> */
> J_ASSERT (commit_transaction->t_sync_datalist == NULL);
> + wake_up(&journal->j_wait_data_flushed)
>
> jbd_debug (3, "JBD: commit phase 3\n");
>
> Index: linux-2.6.26-rc1/fs/jbd/journal.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/jbd/journal.c 2008-05-14 16:36:41.000000000 -0700
> +++ linux-2.6.26-rc1/fs/jbd/journal.c 2008-05-15 14:13:02.000000000 -0700
> @@ -660,6 +660,7 @@ static journal_t * journal_init_common (
> goto fail;
>
> init_waitqueue_head(&journal->j_wait_transaction_locked);
> + init_waitqueue_head(&journal->j_wait_data_flushed);
> init_waitqueue_head(&journal->j_wait_logspace);
> init_waitqueue_head(&journal->j_wait_done_commit);
> init_waitqueue_head(&journal->j_wait_checkpoint);
> Index: linux-2.6.26-rc1/fs/jbd/transaction.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/jbd/transaction.c 2008-05-03 11:59:44.000000000 -0700
> +++ linux-2.6.26-rc1/fs/jbd/transaction.c 2008-05-16 06:42:30.000000000 -0700
> @@ -1648,12 +1648,47 @@ out:
> return;
> }
>
> +/*
> + * Journal_try_to_free_buffers() will call this function to w
> + * waiting for the current transaction finishing syncing data buffers, before
> + * try to free that buffer.
> + *
> + * journal_try_to_free_buffers() could race with journal_commit_transaction()
> + * The later might still hold the reference count to the buffers when inspecting
> + * them on t_syncdata_list or t_locked_list.
> + */
> +static void journal_wait_for_transaction_sync_data(journal_t *journal)
> +{
> + transaction_t *transaction = NULL;
> +
> + transaction = journal->j_committing_transaction;
> +
> + if (!transaction)
> + return;
> +
> + /*
> + * If the current transaction is flushing and waiting for data buffers
> + * (t_state is T_FLUSH), wait for the j_wait_data_flushed event
> + */
> + if (transaction->t_state == T_FLUSH) {
> + DEFINE_WAIT(wait);
> +
> + prepare_to_wait(&journal->j_wait_data_flushed,
> + &wait, TASK_UNINTERRUPTIBLE);
> + spin_unlock(&journal->j_state_lock);
> + schedule();
> + finish_wait(&journal->j_wait_data_flushed, &wait);
> + spin_lock(&journal->j_state_lock);
> + }
> + return;
> +}
>
Got a couple of whitespace problems above it looks like. Thanks,
Josef
On Fri, 2008-05-16 at 11:01 -0400, Josef Bacik wrote:
>
> Got a couple of whitespace problems above it looks like. Thanks,
>
Thanks for catching this; below is the updated patch with the whitespace
and comments fixed.
---------------------------------------------------
JBD: fix journal_try_to_free_buffers race with
journal_commit_transaction
From: Mingming Cao <[email protected]>
This patch fixed a few races between direct IO and kjournld commit
transaction.
An unexpected EIO error gets returned to direct IO caller when it failed
to
free those data buffers. This could be reproduced easily with parallel
direct write and buffered write to the same file
More specificly, those races could cause journal_try_to_free_buffers()
fail to free the data buffers, when jbd is committing the transaction
that has
those data buffers on its t_syncdata_list or t_locked_list.
journal_commit_transaction() still holds the reference to those buffers
before data reach to disk and buffers are removed from the
t_syncdata_list of t_locked_list. This prevent the concurrent
journal_try_to_free_buffers() to free those buffers at the same time,
but cause
EIO error returns back to direct IO.
With this patch, in case of direct IO and when try_to_free_buffers()
failed,
let's waiting for journal_commit_transaction() to finish
flushing the current committing transaction's data buffers to disk,
then try to free those buffers again.
Signed-off-by: Mingming Cao <[email protected]>
Reviewed-by: Badari Pulavarty <[email protected]>
---
fs/jbd/commit.c | 1 +
fs/jbd/journal.c | 1 +
fs/jbd/transaction.c | 46
++++++++++++++++++++++++++++++++++++++++++++++
include/linux/jbd.h | 3 +++
4 files changed, 51 insertions(+)
Index: linux-2.6.26-rc1/include/linux/jbd.h
===================================================================
--- linux-2.6.26-rc1.orig/include/linux/jbd.h 2008-05-14
16:36:41.000000000 -0700
+++ linux-2.6.26-rc1/include/linux/jbd.h 2008-05-15 14:12:10.000000000
-0700
@@ -667,6 +667,9 @@ struct journal_s
*/
wait_queue_head_t j_wait_transaction_locked;
+ /* Wait queu for waiting for data buffers to flushed to disk*/
+ wait_queue_head_t j_wait_data_flushed;
+
/* Wait queue for waiting for checkpointing to complete */
wait_queue_head_t j_wait_logspace;
Index: linux-2.6.26-rc1/fs/jbd/commit.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/jbd/commit.c 2008-05-03 11:59:44.000000000
-0700
+++ linux-2.6.26-rc1/fs/jbd/commit.c 2008-05-15 14:12:46.000000000 -0700
@@ -462,6 +462,7 @@ void journal_commit_transaction(journal_
* clean by now, so check that it is in fact empty.
*/
J_ASSERT (commit_transaction->t_sync_datalist == NULL);
+ wake_up(&journal->j_wait_data_flushed)
jbd_debug (3, "JBD: commit phase 3\n");
Index: linux-2.6.26-rc1/fs/jbd/journal.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/jbd/journal.c 2008-05-14 16:36:41.000000000
-0700
+++ linux-2.6.26-rc1/fs/jbd/journal.c 2008-05-15 14:13:02.000000000
-0700
@@ -660,6 +660,7 @@ static journal_t * journal_init_common (
goto fail;
init_waitqueue_head(&journal->j_wait_transaction_locked);
+ init_waitqueue_head(&journal->j_wait_data_flushed);
init_waitqueue_head(&journal->j_wait_logspace);
init_waitqueue_head(&journal->j_wait_done_commit);
init_waitqueue_head(&journal->j_wait_checkpoint);
Index: linux-2.6.26-rc1/fs/jbd/transaction.c
===================================================================
--- linux-2.6.26-rc1.orig/fs/jbd/transaction.c 2008-05-03
11:59:44.000000000 -0700
+++ linux-2.6.26-rc1/fs/jbd/transaction.c 2008-05-16 09:27:21.000000000
-0700
@@ -1648,12 +1648,49 @@ out:
return;
}
+/*
+ * journal_try_to_free_buffers() could race with
journal_commit_transaction()
+ * The later might still hold the reference count to the buffers when
inspecting
+ * them on t_syncdata_list or t_locked_list.
+ *
+ * Journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction finishing syncing data buffers,
before
+ * try to free that buffer.
+ *
+ * Called with journal->j_state_lock hold.
+ */
+static void journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction)
+ return;
+
+ /*
+ * If the current transaction is flushing and waiting for data buffers
+ * (t_state is T_FLUSH), wait for the j_wait_data_flushed event
+ */
+ if (transaction->t_state == T_FLUSH) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&journal->j_wait_data_flushed,
+ &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&journal->j_state_lock);
+ schedule();
+ finish_wait(&journal->j_wait_data_flushed, &wait);
+ spin_lock(&journal->j_state_lock);
+ }
+ return;
+}
/**
* int journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: unused for allocation purpose. Here is used
+ * as a flag to tell if direct IO is attemping to free buffers.
*
*
* For all the buffers on this page,
@@ -1682,13 +1719,16 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
int ret = 0;
+ int dio = gfp_mask & __GFP_REPEAT;
J_ASSERT(PageLocked(page));
@@ -1713,7 +1753,31 @@ int journal_try_to_free_buffers(journal_
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * In the case of concurrent direct IO and buffered IO,
+ * There are a number of places where we
+ * could race with journal_commit_transaction(), the later still
+ * helds the reference to the buffers to free while processing them.
+ * try_to_free_buffers() failed to free those buffers,
+ * resulting in an unexpected EIO error
+ * returns back to the generic_file_direct_IO()
+ *
+ * So let's wait for the current transaction finished flush
+ * dirty data buffers before we try to free those buffers
+ * again. This wait is needed by direct IO code path only,
+ * gfp_mask __GFP_REPEAT is passed from the direct IO code
+ * path to flag if we need to wait and retry free buffers.
+ */
+ if (ret == 0 && dio) {
+ spin_lock(&journal->j_state_lock);
+ journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ spin_unlock(&journal->j_state_lock);
+ }
+
busy:
return ret;
}
Index: linux-2.6.26-rc1/mm/truncate.c
===================================================================
--- linux-2.6.26-rc1.orig/mm/truncate.c 2008-05-03 11:59:44.000000000
-0700
+++ linux-2.6.26-rc1/mm/truncate.c 2008-05-15 13:13:21.000000000 -0700
@@ -346,7 +346,8 @@ invalidate_complete_page2(struct address
if (page->mapping != mapping)
return 0;
- if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+ if (PagePrivate(page) &&
+ !try_to_release_page(page,GFP_KERNEL|__GFP_REPEAT))
return 0;
write_lock_irq(&mapping->tree_lock);
> Josef
Small nits..
On Fri, 2008-05-16 at 07:14 -0700, Mingming Cao wrote:
> This patch fixed a few races between direct IO and kjournld commit transaction.
> An unexpected EIO error gets returned to direct IO caller when it failed to
> free those data buffers. This could be reproduced easily with parallel
> direct write and buffered write to the same file
>
> More specific, those races could cause journal_try_to_free_buffers()
> fail to free the data buffers, when jbd is committing the transaction that has
> those data buffers on its t_syncdata_list or t_locked_list.
> journal_commit_transaction() still holds the reference to those buffers
> before data reach to disk and buffers are removed from the
> t_syncdata_list of t_locked_list. This prevent the concurrent
> journal_try_to_free_buffers() to free those buffers at the same time, and cause
> EIO error returns back to direct IO.
>
> With this patch, in case of direct IO and when try_to_free_buffers() failed,
> let's waiting for journal_commit_transaction() to finish
> flushing the current committing transaction's data buffers to disk,
> then try to free those buffers again.
>
> Signed-off-by: Mingming Cao <[email protected]>
> Reviewed-by: Badari Pulavarty <[email protected]>
> ---
> fs/jbd/commit.c | 1 +
> fs/jbd/journal.c | 1 +
> fs/jbd/transaction.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/jbd.h | 3 +++
> 4 files changed, 51 insertions(+)
>
> Index: linux-2.6.26-rc1/include/linux/jbd.h
> ===================================================================
> --- linux-2.6.26-rc1.orig/include/linux/jbd.h 2008-05-14 16:36:41.000000000 -0700
> +++ linux-2.6.26-rc1/include/linux/jbd.h 2008-05-15 14:12:10.000000000 -0700
> @@ -667,6 +667,9 @@ struct journal_s
> */
> wait_queue_head_t j_wait_transaction_locked;
>
> + /* Wait queu for waiting for data buffers to flushed to disk*/
Fix typo - "queue" and also this line is wrapping. Could you spilt it
into 2 lines ?
> + wait_queue_head_t j_wait_data_flushed;
> +
> /* Wait queue for waiting for checkpointing to complete */
> wait_queue_head_t j_wait_logspace;
>
> Index: linux-2.6.26-rc1/fs/jbd/commit.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/jbd/commit.c 2008-05-03 11:59:44.000000000 -0700
> +++ linux-2.6.26-rc1/fs/jbd/commit.c 2008-05-15 14:12:46.000000000 -0700
> @@ -462,6 +462,7 @@ void journal_commit_transaction(journal_
> * clean by now, so check that it is in fact empty.
> */
> J_ASSERT (commit_transaction->t_sync_datalist == NULL);
> + wake_up(&journal->j_wait_data_flushed)
missing ";" ? Did it compile ? :)
> jbd_debug (3, "JBD: commit phase 3\n");
>
> Index: linux-2.6.26-rc1/fs/jbd/journal.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/jbd/journal.c 2008-05-14 16:36:41.000000000 -0700
> +++ linux-2.6.26-rc1/fs/jbd/journal.c 2008-05-15 14:13:02.000000000 -0700
> @@ -660,6 +660,7 @@ static journal_t * journal_init_common (
> goto fail;
>
> init_waitqueue_head(&journal->j_wait_transaction_locked);
> + init_waitqueue_head(&journal->j_wait_data_flushed);
> init_waitqueue_head(&journal->j_wait_logspace);
> init_waitqueue_head(&journal->j_wait_done_commit);
> init_waitqueue_head(&journal->j_wait_checkpoint);
> Index: linux-2.6.26-rc1/fs/jbd/transaction.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/jbd/transaction.c 2008-05-03 11:59:44.000000000 -0700
> +++ linux-2.6.26-rc1/fs/jbd/transaction.c 2008-05-16 06:42:30.000000000 -0700
> @@ -1648,12 +1648,47 @@ out:
> return;
> }
>
> +/*
> + * Journal_try_to_free_buffers() will call this function to w
drop "w"
> + * waiting for the current transaction finishing syncing data buffers, before
fix grammar - "wait for the current transaction to finish syncing data
buffers, .."
> + * try to free that buffer.
> + *
> + * journal_try_to_free_buffers() could race with journal_commit_transaction()
> + * The later might still hold the reference count to the buffers when inspecting
> + * them on t_syncdata_list or t_locked_list.
> + */
> +static void journal_wait_for_transaction_sync_data(journal_t *journal)
> +{
> + transaction_t *transaction = NULL;
> +
> + transaction = journal->j_committing_transaction;
> +
> + if (!transaction)
> + return;
> +
> + /*
> + * If the current transaction is flushing and waiting for data buffers
> + * (t_state is T_FLUSH), wait for the j_wait_data_flushed event
> + */
use "tabs" instead of "whitespaces"
> + if (transaction->t_state == T_FLUSH) {
> + DEFINE_WAIT(wait);
> +
> + prepare_to_wait(&journal->j_wait_data_flushed,
> + &wait, TASK_UNINTERRUPTIBLE);
> + spin_unlock(&journal->j_state_lock);
> + schedule();
> + finish_wait(&journal->j_wait_data_flushed, &wait);
> + spin_lock(&journal->j_state_lock);
again "whitespace" issues for the entire function..
> + }
> + return;
> +}
>
> /**
> * int journal_try_to_free_buffers() - try to free page buffers.
> * @journal: journal for operation
> * @page: to try and free
> - * @unused_gfp_mask: unused
> + * @gfp_mask: unused for allocation purpose. Here is used
> + * as a flag to tell if direct IO is attemping to free buffers.
> *
> *
> * For all the buffers on this page,
> @@ -1682,13 +1717,16 @@ out:
> * journal_try_to_free_buffer() is changing its state. But that
> * cannot happen because we never reallocate freed data as metadata
> * while the data is part of a transaction. Yes?
> + *
> + * Return 0 on failure, 1 on success
> */
> int journal_try_to_free_buffers(journal_t *journal,
> - struct page *page, gfp_t unused_gfp_mask)
> + struct page *page, gfp_t gfp_mask)
> {
> struct buffer_head *head;
> struct buffer_head *bh;
> int ret = 0;
> + int dio = gfp_mask & __GFP_REPEAT;
>
> J_ASSERT(PageLocked(page));
>
> @@ -1713,7 +1751,31 @@ int journal_try_to_free_buffers(journal_
> if (buffer_jbd(bh))
> goto busy;
> } while ((bh = bh->b_this_page) != head);
> +
> ret = try_to_free_buffers(page);
> +
> + /*
> + * In the case of concurrent direct IO and buffered IO,
> + * There are a number of places where we
> + * could race with journal_commit_transaction(), the later still
> + * helds the reference to the buffers to free while processing them.
"holds the reference to the buffers while processing them".
> + * try_to_free_buffers() failed to free those buffers,
> + * resulting in an unexpected EIO error
> + * returns back to the generic_file_direct_IO()
> + *
> + * So let's wait for the current transaction finished flush
"wait for the commited transaction to finish the flush of dirty data"
> + * dirty data buffers before we try to free those buffers
> + * again. This wait is needed by direct IO code path only,
> + * gfp_mask __GFP_REPEAT is passed from the direct IO code
> + * path to flag if we need to wait and retry free buffers.
> + */
> + if (ret == 0 && dio) {
> + spin_lock(&journal->j_state_lock);
> + journal_wait_for_transaction_sync_data(journal);
> + ret = try_to_free_buffers(page);
> + spin_unlock(&journal->j_state_lock);
> + }
> +
> busy:
> return ret;
> }
> Index: linux-2.6.26-rc1/mm/truncate.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/mm/truncate.c 2008-05-03 11:59:44.000000000 -0700
> +++ linux-2.6.26-rc1/mm/truncate.c 2008-05-15 13:13:21.000000000 -0700
> @@ -346,7 +346,8 @@ invalidate_complete_page2(struct address
> if (page->mapping != mapping)
> return 0;
>
> - if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
> + if (PagePrivate(page) &&
> + !try_to_release_page(page,GFP_KERNEL|__GFP_REPEAT))
> return 0;
>
> write_lock_irq(&mapping->tree_lock);
>
>
On Fri, 2008-05-16 at 10:11 -0700, Mingming Cao wrote:
> On Fri, 2008-05-16 at 11:01 -0400, Josef Bacik wrote:
>
> >
> > Got a couple of whitespace problems above it looks like. Thanks,
> >
>
> Thanks for catching this, below is updated patch, fixed the whitespace
> and comments.
>
>
> ---------------------------------------------------
> JBD: fix journal_try_to_free_buffers race with
> journal_commit_transaction
>
> From: Mingming Cao <[email protected]>
>
> This patch fixed a few races between direct IO and kjournld commit
> transaction.
> An unexpected EIO error gets returned to direct IO caller when it failed
> to
> free those data buffers. This could be reproduced easily with parallel
> direct write and buffered write to the same file
>
> More specificly, those races could cause journal_try_to_free_buffers()
> fail to free the data buffers, when jbd is committing the transaction
> that has
> those data buffers on its t_syncdata_list or t_locked_list.
> journal_commit_transaction() still holds the reference to those buffers
> before data reach to disk and buffers are removed from the
> t_syncdata_list of t_locked_list. This prevent the concurrent
> journal_try_to_free_buffers() to free those buffers at the same time,
> but cause
> EIO error returns back to direct IO.
>
> With this patch, in case of direct IO and when try_to_free_buffers()
> failed,
> let's waiting for journal_commit_transaction() to finish
> flushing the current committing transaction's data buffers to disk,
> then try to free those buffers again.
>
> Signed-off-by: Mingming Cao <[email protected]>
> Reviewed-by: Badari Pulavarty <[email protected]>
> ---
> fs/jbd/commit.c | 1 +
> fs/jbd/journal.c | 1 +
> fs/jbd/transaction.c | 46
> ++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/jbd.h | 3 +++
> 4 files changed, 51 insertions(+)
>
> Index: linux-2.6.26-rc1/include/linux/jbd.h
> ===================================================================
> --- linux-2.6.26-rc1.orig/include/linux/jbd.h 2008-05-14
> 16:36:41.000000000 -0700
> +++ linux-2.6.26-rc1/include/linux/jbd.h 2008-05-15 14:12:10.000000000
> -0700
> @@ -667,6 +667,9 @@ struct journal_s
> */
> wait_queue_head_t j_wait_transaction_locked;
>
> + /* Wait queu for waiting for data buffers to flushed to disk*/
> + wait_queue_head_t j_wait_data_flushed;
> +
> /* Wait queue for waiting for checkpointing to complete */
> wait_queue_head_t j_wait_logspace;
>
> Index: linux-2.6.26-rc1/fs/jbd/commit.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/jbd/commit.c 2008-05-03 11:59:44.000000000
> -0700
> +++ linux-2.6.26-rc1/fs/jbd/commit.c 2008-05-15 14:12:46.000000000 -0700
> @@ -462,6 +462,7 @@ void journal_commit_transaction(journal_
> * clean by now, so check that it is in fact empty.
> */
> J_ASSERT (commit_transaction->t_sync_datalist == NULL);
> + wake_up(&journal->j_wait_data_flushed)
>
> jbd_debug (3, "JBD: commit phase 3\n");
>
> Index: linux-2.6.26-rc1/fs/jbd/journal.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/jbd/journal.c 2008-05-14 16:36:41.000000000
> -0700
> +++ linux-2.6.26-rc1/fs/jbd/journal.c 2008-05-15 14:13:02.000000000
> -0700
> @@ -660,6 +660,7 @@ static journal_t * journal_init_common (
> goto fail;
>
> init_waitqueue_head(&journal->j_wait_transaction_locked);
> + init_waitqueue_head(&journal->j_wait_data_flushed);
> init_waitqueue_head(&journal->j_wait_logspace);
> init_waitqueue_head(&journal->j_wait_done_commit);
> init_waitqueue_head(&journal->j_wait_checkpoint);
> Index: linux-2.6.26-rc1/fs/jbd/transaction.c
> ===================================================================
> --- linux-2.6.26-rc1.orig/fs/jbd/transaction.c 2008-05-03
> 11:59:44.000000000 -0700
> +++ linux-2.6.26-rc1/fs/jbd/transaction.c 2008-05-16 09:27:21.000000000
> -0700
> @@ -1648,12 +1648,49 @@ out:
> return;
> }
>
> +/*
> + * journal_try_to_free_buffers() could race with
> journal_commit_transaction()
> + * The later might still hold the reference count to the buffers when
> inspecting
> + * them on t_syncdata_list or t_locked_list.
> + *
> + * Journal_try_to_free_buffers() will call this function to
> + * wait for the current transaction finishing syncing data buffers,
> before
> + * try to free that buffer.
> + *
> + * Called with journal->j_state_lock hold.
Fix wrapping lines ?
> + */
> +static void journal_wait_for_transaction_sync_data(journal_t *journal)
> +{
> + transaction_t *transaction = NULL;
> +
> + transaction = journal->j_committing_transaction;
> +
> + if (!transaction)
> + return;
> +
> + /*
> + * If the current transaction is flushing and waiting for data buffers
> + * (t_state is T_FLUSH), wait for the j_wait_data_flushed event
> + */
> + if (transaction->t_state == T_FLUSH) {
> + DEFINE_WAIT(wait);
> +
> + prepare_to_wait(&journal->j_wait_data_flushed,
> + &wait, TASK_UNINTERRUPTIBLE);
> + spin_unlock(&journal->j_state_lock);
> + schedule();
> + finish_wait(&journal->j_wait_data_flushed, &wait);
> + spin_lock(&journal->j_state_lock);
> + }
> + return;
> +}
>
> /**
> * int journal_try_to_free_buffers() - try to free page buffers.
> * @journal: journal for operation
> * @page: to try and free
> - * @unused_gfp_mask: unused
> + * @gfp_mask: unused for allocation purpose. Here is used
> + * as a flag to tell if direct IO is attemping to free buffers.
> *
> *
> * For all the buffers on this page,
> @@ -1682,13 +1719,16 @@ out:
> * journal_try_to_free_buffer() is changing its state. But that
> * cannot happen because we never reallocate freed data as metadata
> * while the data is part of a transaction. Yes?
> + *
> + * Return 0 on failure, 1 on success
> */
> int journal_try_to_free_buffers(journal_t *journal,
> - struct page *page, gfp_t unused_gfp_mask)
> + struct page *page, gfp_t gfp_mask)
> {
> struct buffer_head *head;
> struct buffer_head *bh;
> int ret = 0;
> + int dio = gfp_mask & __GFP_REPEAT;
>
> J_ASSERT(PageLocked(page));
>
> @@ -1713,7 +1753,31 @@ int journal_try_to_free_buffers(journal_
> if (buffer_jbd(bh))
> goto busy;
> } while ((bh = bh->b_this_page) != head);
> +
> ret = try_to_free_buffers(page);
> +
> + /*
> + * In the case of concurrent direct IO and buffered IO,
> + * There are a number of places where we
> + * could race with journal_commit_transaction(), the later still
> + * helds the reference to the buffers to free while processing them.
> + * try_to_free_buffers() failed to free those buffers,
> + * resulting in an unexpected EIO error
> + * returns back to the generic_file_direct_IO()
> + *
> + * So let's wait for the current transaction finished flush
> + * dirty data buffers before we try to free those buffers
> + * again. This wait is needed by direct IO code path only,
> + * gfp_mask __GFP_REPEAT is passed from the direct IO code
> + * path to flag if we need to wait and retry free buffers.
> + */
> + if (ret == 0 && dio) {
drop "dio" variable and compare here, like
if (ret == 0 && (gfp_mask & __GFP_REPEAT))
> + spin_lock(&journal->j_state_lock);
> + journal_wait_for_transaction_sync_data(journal);
> + ret = try_to_free_buffers(page);
> + spin_unlock(&journal->j_state_lock);
> + }
Thanks,
Badari
On Fri, 2008-05-16 at 10:17 -0700, Badari Pulavarty wrote:
> > @@ -1713,7 +1753,31 @@ int journal_try_to_free_buffers(journal_
> > if (buffer_jbd(bh))
> > goto busy;
> > } while ((bh = bh->b_this_page) != head);
> > +
> > ret = try_to_free_buffers(page);
> > +
> > + /*
> > + * In the case of concurrent direct IO and buffered IO,
> > + * There are a number of places where we
> > + * could race with journal_commit_transaction(), the later still
> > + * helds the reference to the buffers to free while processing them.
> > + * try_to_free_buffers() failed to free those buffers,
> > + * resulting in an unexpected EIO error
> > + * returns back to the generic_file_direct_IO()
> > + *
> > + * So let's wait for the current transaction finished flush
> > + * dirty data buffers before we try to free those buffers
> > + * again. This wait is needed by direct IO code path only,
> > + * gfp_mask __GFP_REPEAT is passed from the direct IO code
> > + * path to flag if we need to wait and retry free buffers.
> > + */
> > + if (ret == 0 && dio) {
>
> drop "dio" variable and compare here, like
> if (ret == 0 && (gfp_mask & __GFP_REPEAT))
>
Okay, will do. Also will update the patch with other format changes you
suggested.
> > + spin_lock(&journal->j_state_lock);
> > + journal_wait_for_transaction_sync_data(journal);
> > + ret = try_to_free_buffers(page);
> > + spin_unlock(&journal->j_state_lock);
> > + }
>
Also, this patch changed struct journal_s to add a new wait queue, so
that journal_try_to_free_buffers() would only need to wait for the data
to be committed, rather than using the j_wait_done_commit queue and
waiting for the whole transaction to be committed. It might break other
filesystems that use journal_s, so it's not worth it. Will update the patch.
This patch fixes a few races between direct IO and the kjournald commit
transaction. An unexpected EIO error gets returned to the direct IO caller
when it fails to free those data buffers. This can be reproduced easily with
parallel direct writes and buffered writes to the same file.
More specifically, those races can cause journal_try_to_free_buffers() to
fail to free the data buffers when jbd is committing the transaction that has
those data buffers on its t_syncdata_list or t_locked_list.
journal_commit_transaction() still holds a reference to those buffers
until the data reaches disk and the buffers are removed from the
t_syncdata_list or t_locked_list. This prevents a concurrent
journal_try_to_free_buffers() from freeing those buffers at the same time, and
causes an EIO error to be returned to direct IO.
With this patch, in the direct IO case, when try_to_free_buffers() fails we
wait for journal_commit_transaction() to finish flushing the currently
committing transaction's data buffers to disk, then try to free those
buffers again.
Signed-off-by: Mingming Cao <[email protected]>
Reviewed-by: Badari Pulavarty <[email protected]>
---
fs/jbd/transaction.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++--
mm/truncate.c | 3 +-
2 files changed, 57 insertions(+), 3 deletions(-)
Index: linux-2.6.26-rc2/fs/jbd/transaction.c
===================================================================
--- linux-2.6.26-rc2.orig/fs/jbd/transaction.c 2008-05-16 11:51:02.000000000 -0700
+++ linux-2.6.26-rc2/fs/jbd/transaction.c 2008-05-16 13:43:02.000000000 -0700
@@ -1648,12 +1648,39 @@ out:
return;
}
+/*
+ * journal_try_to_free_buffers() could race with journal_commit_transaction().
+ * The latter might still hold a reference count on the buffers while
+ * inspecting them on the t_syncdata_list or t_locked_list.
+ *
+ * journal_try_to_free_buffers() calls this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free those buffers again.
+ *
+ * Called with journal->j_state_lock held.
+ */
+static void journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+ tid_t tid;
+
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction)
+ return;
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ log_wait_commit(journal, tid);
+ spin_lock(&journal->j_state_lock);
+}
/**
* int journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: not used for allocation purposes here; used
+ * as a flag to tell whether direct IO is attempting to free buffers.
*
*
* For all the buffers on this page,
@@ -1682,9 +1709,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1713,7 +1742,31 @@ int journal_try_to_free_buffers(journal_
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * In the case of concurrent direct IO and buffered IO,
+ * there are a number of places where we
+ * could race with journal_commit_transaction(); the latter may still
+ * hold a reference to the buffers to free while processing them.
+ * try_to_free_buffers() then fails to free those buffers,
+ * resulting in an unexpected EIO error
+ * being returned to generic_file_direct_IO().
+ *
+ * So let's wait for the current transaction to finish flushing
+ * dirty data buffers before we try to free those buffers
+ * again. This wait is needed by the direct IO code path only;
+ * __GFP_REPEAT in gfp_mask is passed from the direct IO code
+ * path to flag whether we need to wait and retry freeing the buffers.
+ */
+ if (ret == 0 && gfp_mask & __GFP_REPEAT) {
+ spin_lock(&journal->j_state_lock);
+ journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ spin_unlock(&journal->j_state_lock);
+ }
+
busy:
return ret;
}
Index: linux-2.6.26-rc2/mm/truncate.c
===================================================================
--- linux-2.6.26-rc2.orig/mm/truncate.c 2008-05-16 11:51:02.000000000 -0700
+++ linux-2.6.26-rc2/mm/truncate.c 2008-05-16 13:42:18.000000000 -0700
@@ -346,7 +346,8 @@ invalidate_complete_page2(struct address
if (page->mapping != mapping)
return 0;
- if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+ if (PagePrivate(page) &&
+ !try_to_release_page(page,GFP_KERNEL|__GFP_REPEAT))
return 0;
write_lock_irq(&mapping->tree_lock);
JBD2: fix DIO error caused by race with DIO free_buffers and jbd2 commit transaction
From: Mingming Cao <[email protected]>
This patch fixes a few races between direct IO and the kjournald commit
transaction. An unexpected EIO error gets returned to the direct IO caller
when it fails to free those data buffers. This can be reproduced easily with
parallel direct writes and buffered writes to the same file.
More specifically, those races can cause jbd2_journal_try_to_free_buffers() to
fail to free the data buffers when jbd2 is committing the transaction that has
those data buffers on its t_syncdata_list or t_locked_list.
jbd2_journal_commit_transaction() still holds a reference to those buffers
until the data reaches disk and the buffers are removed from the
t_syncdata_list or t_locked_list. This prevents a concurrent
jbd2_journal_try_to_free_buffers() from freeing those buffers at the same time,
and causes an EIO error to be returned to direct IO.
With this patch, in the direct IO case, when try_to_free_buffers() fails we
wait for jbd2_journal_commit_transaction() to finish flushing the currently
committing transaction's data buffers to disk, then try to free those
buffers again.
Signed-off-by: Mingming Cao <[email protected]>
Reviewed-by: Badari Pulavarty <[email protected]>
---
fs/jbd2/transaction.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 55 insertions(+), 2 deletions(-)
Index: linux-2.6.26-rc2/fs/jbd2/transaction.c
===================================================================
--- linux-2.6.26-rc2.orig/fs/jbd2/transaction.c 2008-05-16 11:16:56.000000000 -0700
+++ linux-2.6.26-rc2/fs/jbd2/transaction.c 2008-05-16 13:52:04.000000000 -0700
@@ -1656,12 +1656,39 @@ out:
return;
}
+/*
+ * jbd2_journal_try_to_free_buffers() could race with jbd2_journal_commit_transaction().
+ * The latter might still hold a reference count on the buffers while
+ * inspecting them on the t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() calls this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free those buffers again.
+ *
+ * Called with journal->j_state_lock held.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+ tid_t tid;
+
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction)
+ return;
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+ spin_lock(&journal->j_state_lock);
+}
/**
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: not used for allocation purposes here; used
+ * as a flag to tell whether direct IO is attempting to free buffers.
*
*
* For all the buffers on this page,
@@ -1690,9 +1717,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1721,7 +1750,31 @@ int jbd2_journal_try_to_free_buffers(jou
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * In the case of concurrent direct IO and buffered IO,
+ * there are a number of places where we
+ * could race with jbd2_journal_commit_transaction(); the latter may still
+ * hold a reference to the buffers to free while processing them.
+ * try_to_free_buffers() then fails to free those buffers,
+ * resulting in an unexpected EIO error
+ * being returned to generic_file_direct_IO().
+ *
+ * So let's wait for the current transaction to finish flushing
+ * dirty data buffers before we try to free those buffers
+ * again. This wait is needed by the direct IO code path only;
+ * __GFP_REPEAT in gfp_mask is passed from the direct IO code
+ * path to flag whether we need to wait and retry freeing the buffers.
+ */
+ if (ret == 0 && gfp_mask & __GFP_REPEAT) {
+ spin_lock(&journal->j_state_lock);
+ jbd2_journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ spin_unlock(&journal->j_state_lock);
+ }
+
busy:
return ret;
}
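As with ext3, the flag reaches jbd2 through ext4's releasepage, which
forwards the gfp mask unchanged; roughly, from memory:

	static int ext4_releasepage(struct page *page, gfp_t wait)
	{
		journal_t *journal = EXT4_JOURNAL(page->mapping->host);

		WARN_ON(PageChecked(page));
		if (!page_has_buffers(page))
			return 0;
		return jbd2_journal_try_to_free_buffers(journal, page, wait);
	}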
Hi,
> This patch fixes a few races between direct IO and the kjournald commit
> transaction. An unexpected EIO error gets returned to the direct IO
> caller when it fails to free those data buffers. This can be
> reproduced easily with parallel direct writes and buffered writes to the
> same file.
>
> More specifically, those races can cause journal_try_to_free_buffers()
> to fail to free the data buffers when jbd is committing the transaction
> that has those data buffers on its t_syncdata_list or t_locked_list.
> journal_commit_transaction() still holds a reference to those
> buffers until the data reaches disk and the buffers are removed from the
> t_syncdata_list or t_locked_list. This prevents a concurrent
> journal_try_to_free_buffers() from freeing those buffers at the same time,
> and causes an EIO error to be returned to direct IO.
>
> With this patch, in the direct IO case, when try_to_free_buffers() fails,
> we wait for journal_commit_transaction() to finish
> flushing the currently committing transaction's data buffers to disk,
> then try to free those buffers again.
If Andrew or Christoph wouldn't beat you for "inventive use" of
gfp_mask, I'm fine with the patch as well ;). You can add
Acked-by: Jan Kara <[email protected]>
>
> Signed-off-by: Mingming Cao <[email protected]>
> Reviewed-by: Badari Pulavarty <[email protected]>
> ---
> fs/jbd/transaction.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++--
> mm/truncate.c | 3 +-
> 2 files changed, 57 insertions(+), 3 deletions(-)
>
> Index: linux-2.6.26-rc2/fs/jbd/transaction.c
> ===================================================================
> --- linux-2.6.26-rc2.orig/fs/jbd/transaction.c 2008-05-16 11:51:02.000000000 -0700
> +++ linux-2.6.26-rc2/fs/jbd/transaction.c 2008-05-16 13:43:02.000000000 -0700
> @@ -1648,12 +1648,39 @@ out:
> return;
> }
>
> +/*
> + * journal_try_to_free_buffers() could race with journal_commit_transaction().
> + * The latter may still hold a reference to the buffers while inspecting
> + * them on t_syncdata_list or t_locked_list.
> + *
> + * journal_try_to_free_buffers() will call this function to
> + * wait for the current transaction to finish syncing data buffers, before
> + * trying to free those buffers.
> + *
> + * Called with journal->j_state_lock held.
> + */
> +static void journal_wait_for_transaction_sync_data(journal_t *journal)
> +{
> + transaction_t *transaction = NULL;
> + tid_t tid;
> +
> + transaction = journal->j_committing_transaction;
> +
> + if (!transaction)
> + return;
> +
> + tid = transaction->t_tid;
> + spin_unlock(&journal->j_state_lock);
> + log_wait_commit(journal, tid);
> + spin_lock(&journal->j_state_lock);
> +}
>
> /**
> * int journal_try_to_free_buffers() - try to free page buffers.
> * @journal: journal for operation
> * @page: to try and free
> - * @unused_gfp_mask: unused
> + * @gfp_mask: not used for allocation here; used as a flag to tell
> + * whether direct IO is attempting to free the buffers.
> *
> *
> * For all the buffers on this page,
> @@ -1682,9 +1709,11 @@ out:
> * journal_try_to_free_buffer() is changing its state. But that
> * cannot happen because we never reallocate freed data as metadata
> * while the data is part of a transaction. Yes?
> + *
> + * Return 0 on failure, 1 on success
> */
> int journal_try_to_free_buffers(journal_t *journal,
> - struct page *page, gfp_t unused_gfp_mask)
> + struct page *page, gfp_t gfp_mask)
> {
> struct buffer_head *head;
> struct buffer_head *bh;
> @@ -1713,7 +1742,31 @@ int journal_try_to_free_buffers(journal_
> if (buffer_jbd(bh))
> goto busy;
> } while ((bh = bh->b_this_page) != head);
> +
> ret = try_to_free_buffers(page);
> +
> + /*
> + * With concurrent direct IO and buffered IO, there are a number of
> + * places where we can race with journal_commit_transaction(): the
> + * latter still holds a reference to the buffers while processing
> + * them, so try_to_free_buffers() fails to free them and an unexpected
> + * EIO error is returned to generic_file_direct_IO().
> + *
> + * So wait for the current transaction to finish flushing its dirty
> + * data buffers before trying to free those buffers again. Only the
> + * direct IO path needs this wait; it passes __GFP_REPEAT in gfp_mask
> + * to flag that we should wait and retry freeing the buffers.
> + */
> + if (ret == 0 && gfp_mask & __GFP_REPEAT) {
> + spin_lock(&journal->j_state_lock);
> + journal_wait_for_transaction_sync_data(journal);
> + ret = try_to_free_buffers(page);
> + spin_unlock(&journal->j_state_lock);
> + }
> +
> busy:
> return ret;
> }
> Index: linux-2.6.26-rc2/mm/truncate.c
> ===================================================================
> --- linux-2.6.26-rc2.orig/mm/truncate.c 2008-05-16 11:51:02.000000000 -0700
> +++ linux-2.6.26-rc2/mm/truncate.c 2008-05-16 13:42:18.000000000 -0700
> @@ -346,7 +346,8 @@ invalidate_complete_page2(struct address
> if (page->mapping != mapping)
> return 0;
>
> - if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
> + if (PagePrivate(page) &&
> + !try_to_release_page(page,GFP_KERNEL|__GFP_REPEAT))
> return 0;
>
> write_lock_irq(&mapping->tree_lock);
>
Honza
--
Jan Kara <[email protected]>
SuSE CR Labs
On Mon, 2008-05-19 at 00:37 +0200, Jan Kara wrote:
> Hi,
>
> > [...]
> If Andrew or Christoph wouldn't beat you for "inventive use" of
> gfp_mask, I'm fine with the patch as well ;). You can add
> Acked-by: Jan Kara <[email protected]>
>
This is a less intrusive way to fix this problem. The gfp_mask was marked
as unused in journal_try_to_free_buffers(). I looked at the filesystems in
the kernel: only a few define a releasepage() callback, and of those only
xfs checks the flag (but does not use it); btrfs actually uses it, though.
I thought about the way you have suggested, i.e. cleaning up this gfp_mask
and replacing it with a flag, but I am not entirely sure we need to change
address_space_operations and fix all the filesystems for this.

Andrew, what do you think? Is this approach acceptable?
Thanks and regards,
Mingming
On Mon, 19 May 2008 12:59:18 -0700
Mingming Cao <[email protected]> wrote:
> On Mon, 2008-05-19 at 00:37 +0200, Jan Kara wrote:
> > [...]
> This is a less intrusive way to fix this problem. The gfp_mask was marked
> as unused in journal_try_to_free_buffers(). I looked at the filesystems in
> the kernel: only a few define a releasepage() callback, and of those only
> xfs checks the flag (but does not use it); btrfs actually uses it, though.
> I thought about the way you have suggested, i.e. cleaning up this gfp_mask
> and replacing it with a flag, but I am not entirely sure we need to change
> address_space_operations and fix all the filesystems for this.
>
> Andrew, what do you think? Is this approach acceptable?
>
<wakes up>
Please ensure that the final patch is sufficiently well changelogged to
permit me to remain asleep ;)
The ->releasepage semantics are fairly ad-hoc and have grown over time.
It'd be nice to prevent them from becoming vaguer than they are.
It has been (approximately?) the case that code paths which really care
about having the page released will set __GFP_WAIT (via GFP_KERNEL)
whereas code paths which are happy with best-effort will clear
__GFP_WAIT (with a "0"). And that's reasonable - __GFP_WAIT here
means "be synchronous" whereas !__GFP_WAIT means "be non-blocking".
Is that old convention not sufficient here as well? Two problem areas
I see are mm/vmscan.c and fs/splice.c (there may be others).
In mm/vmscan.c we probably don't want your new synchronous behaviour
and it might well be deadlockable anyway. No probs, that's what
__GFP_FS is for.
In fs/splice.c, reading the comment there I have a feeling that you've
found another bug, and that splice _does_ want your new synchronous
behaviour?
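To make that convention concrete, a ->releasepage() honouring it might look
roughly like this (an illustrative sketch only; example_releasepage() and
release_after_waiting_for_commit() are made-up names, not kernel functions):

	static int example_releasepage(struct page *page, gfp_t gfp_mask)
	{
		/* !__GFP_WAIT or !__GFP_FS: best effort only, must not block */
		if (!(gfp_mask & __GFP_WAIT) || !(gfp_mask & __GFP_FS))
			return try_to_free_buffers(page);

		/* __GFP_WAIT && __GFP_FS: may block, e.g. wait for commit */
		return release_after_waiting_for_commit(page);	/* hypothetical */
	}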
On Mon, 2008-05-19 at 13:25 -0700, Andrew Morton wrote:
> On Mon, 19 May 2008 12:59:18 -0700
> Mingming Cao <[email protected]> wrote:
>
> > [...]
>
> <wakes up>
>
> Please ensure that the final patch is sufficiently well changelogged to
> permit me to remain asleep ;)
:-)
> The ->releasepage semantics are fairly ad-hoc and have grown over time.
> It'd be nice to prevent them from becoming vaguer than they are.
>
> It has been (approximately?) the case that code paths which really care
> about having the page released will set __GFP_WAIT (via GFP_KERNEL)
> whereas code paths which are happy with best-effort will clear
> > __GFP_WAIT (with a "0"). And that's reasonable - __GFP_WAIT here
> means "be synchronous" whereas !__GFP_WAIT means "be non-blocking".
>
This makes sense to me.
> Is that old convention not sufficient here as well? Two problem areas
> I see are mm/vmscan.c and fs/splice.c (there may be others).
>
> In mm/vmscan.c we probably don't want your new synchronous behaviour
> and it might well be deadlockable anyway. No probs, that's what
> __GFP_FS is for.
>
Sure. We could check __GFP_FS and __GFP_WAIT, and that makes sense.
> In fs/splice.c, reading the comment there I have a feeling that you've
> found another bug, and that splice _does_ want your new synchronous
> behaviour?
Yes, it looks like page_cache_pipe_buf_steal() expects the page buffers
to be freed before it removes the page, since it passes the GFP_KERNEL
flag, but currently ext3 can fail to release the page when it is called.
In fact the try_to_release_page() return value is ignored in
page_cache_pipe_buf_steal(); the failure case should probably be checked.
The other caller of try_to_release_page(), fallback_migrate_page() in
mm/migrate.c, does want the synchronous behaviour, to make sure the
buffers are dropped.
I will reuse the __GFP_WAIT and __GFP_FS flags in the updated patch.
Thanks for your feedback.
Mingming
On Mon, May 19 2008, Mingming Cao wrote:
> On Mon, 2008-05-19 at 13:25 -0700, Andrew Morton wrote:
> > [...]
> > In fs/splice.c, reading the comment there I have a feeling that you've
> > found another bug, and that splice _does_ want your new synchronous
> > behaviour?
>
> > Yes, it looks like page_cache_pipe_buf_steal() expects the page buffers
> > to be freed before it removes the page, since it passes the GFP_KERNEL
> > flag, but currently ext3 can fail to release the page when it is called.
> > In fact the try_to_release_page() return value is ignored in
> > page_cache_pipe_buf_steal(); the failure case should probably be checked.
> >
> > The other caller of try_to_release_page(), fallback_migrate_page() in
> > mm/migrate.c, does want the synchronous behaviour, to make sure the
> > buffers are dropped.
So something like this, then?
diff --git a/fs/splice.c b/fs/splice.c
index 7815003..e08a2f5 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -58,8 +58,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
*/
wait_on_page_writeback(page);
- if (PagePrivate(page))
- try_to_release_page(page, GFP_KERNEL);
+ if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+ goto out_unlock;
/*
* If we succeeded in removing the mapping, set LRU flag
@@ -75,6 +75,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
* Raced with truncate or failed to remove page from current
* address space, unlock and return failure.
*/
+out_unlock:
unlock_page(page);
return 1;
}
--
Jens Axboe
On Tue, 2008-05-20 at 11:30 +0200, Jens Axboe wrote:
> [...]
> So something like this, then?
>
Acked-by: Mingming Cao <[email protected]>
Updated patch to use the existing GFP_KERNEL mask to indicate a synchronous
wait in journal_try_to_free_buffers(), resolving the race with
journal_commit_transaction().
JBD: fix race between journal_try_to_free_buffers() and jbd commit transaction
From: Mingming Cao <[email protected]>
journal_try_to_free_buffers() can race with the jbd commit transaction when
the latter holds the buffer reference while waiting for the data buffer
to be flushed to disk. If the caller of journal_try_to_free_buffers()
asked for a hard try at releasing the buffers, the failure is treated as an
error and returned to the caller. We have seen direct IO fail due to this
race. Some callers of releasepage() also expect the buffers to be dropped
when GFP_KERNEL is passed to releasepage()->journal_try_to_free_buffers().

With this patch, if the caller passes GFP_KERNEL to indicate that the call
may wait, then when try_to_free_buffers() fails we wait for
journal_commit_transaction() to finish committing the current committing
transaction, and then try to free those buffers again with the journal
locked.
Signed-off-by: Mingming Cao <[email protected]>
Reviewed-by: Badari Pulavarty <[email protected]>
---
fs/jbd/transaction.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++--
mm/filemap.c | 3 --
2 files changed, 54 insertions(+), 4 deletions(-)
Index: linux-2.6.26-rc2/fs/jbd/transaction.c
===================================================================
--- linux-2.6.26-rc2.orig/fs/jbd/transaction.c 2008-05-11 17:09:41.000000000 -0700
+++ linux-2.6.26-rc2/fs/jbd/transaction.c 2008-05-19 16:16:41.000000000 -0700
@@ -1648,12 +1648,39 @@ out:
return;
}
+/*
+ * journal_try_to_free_buffers() could race with journal_commit_transaction().
+ * The latter may still hold a reference to the buffers while inspecting
+ * them on t_syncdata_list or t_locked_list.
+ *
+ * journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free those buffers.
+ *
+ * Called with journal->j_state_lock held.
+ */
+static void journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+ tid_t tid;
+
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction)
+ return;
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ log_wait_commit(journal, tid);
+ spin_lock(&journal->j_state_lock);
+}
/**
* int journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: specifies whether the call may block
+ * (__GFP_WAIT & __GFP_FS via GFP_KERNEL)
*
*
* For all the buffers on this page,
@@ -1682,9 +1709,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1713,7 +1742,30 @@ int journal_try_to_free_buffers(journal_
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where journal_try_to_free_buffers()
+ * could race with journal_commit_transaction(): the latter still
+ * holds a reference to the buffers while processing them, so
+ * try_to_free_buffers() fails to free them. Some callers of
+ * releasepage() require the page buffers to be dropped and treat
+ * a failure to free them as an error (generic_file_direct_IO() does).
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour (i.e. make sure the buffers are dropped upon return),
+ * wait for the current transaction to finish flushing its dirty
+ * data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && gfp_mask & GFP_KERNEL) {
+ spin_lock(&journal->j_state_lock);
+ journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ spin_unlock(&journal->j_state_lock);
+ }
+
busy:
return ret;
}
Index: linux-2.6.26-rc2/mm/filemap.c
===================================================================
--- linux-2.6.26-rc2.orig/mm/filemap.c 2008-05-19 16:00:01.000000000 -0700
+++ linux-2.6.26-rc2/mm/filemap.c 2008-05-19 16:01:34.000000000 -0700
@@ -2581,9 +2581,8 @@ out:
* Otherwise return zero.
*
* The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
*
- * NOTE: @gfp_mask may go away, and this function may become non-blocking.
*/
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
JBD2: fix race between jbd2_journal_try_to_free_buffers() and jbd2 commit transaction
From: Mingming Cao <[email protected]>
journal_try_to_free_buffers() can race with the jbd commit transaction when
the latter holds the buffer reference while waiting for the data buffer
to be flushed to disk. If the caller of journal_try_to_free_buffers()
asked for a hard try at releasing the buffers, the failure is treated as an
error and returned to the caller. We have seen direct IO fail due to this
race. Some callers of releasepage() also expect the buffers to be dropped
when GFP_KERNEL is passed to releasepage()->journal_try_to_free_buffers().

With this patch, if the caller passes GFP_KERNEL to indicate that the call
may wait, then when try_to_free_buffers() fails we wait for
journal_commit_transaction() to finish committing the current committing
transaction, and then try to free those buffers again with the journal
locked.
Signed-off-by: Mingming Cao <[email protected]>
Reviewed-by: Badari Pulavarty <[email protected]>
---
fs/jbd2/transaction.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 54 insertions(+), 2 deletions(-)
Index: linux-2.6.26-rc2/fs/jbd2/transaction.c
===================================================================
--- linux-2.6.26-rc2.orig/fs/jbd2/transaction.c 2008-05-11 17:09:41.000000000 -0700
+++ linux-2.6.26-rc2/fs/jbd2/transaction.c 2008-05-19 17:20:01.000000000 -0700
@@ -1656,12 +1656,39 @@ out:
return;
}
+/*
+ * jbd2_journal_try_to_free_buffers() could race with jbd2_journal_commit_transaction().
+ * The latter may still hold a reference to the buffers while inspecting
+ * them on t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free those buffers.
+ *
+ * Called with journal->j_state_lock held.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+ tid_t tid;
+
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction)
+ return;
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+ spin_lock(&journal->j_state_lock);
+}
/**
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: specifies whether the call may block
+ * (__GFP_WAIT & __GFP_FS via GFP_KERNEL)
*
*
* For all the buffers on this page,
@@ -1690,9 +1717,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1721,7 +1750,30 @@ int jbd2_journal_try_to_free_buffers(jou
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where jbd2_journal_try_to_free_buffers()
+ * could race with jbd2_journal_commit_transaction(): the latter still
+ * holds a reference to the buffers while processing them, so
+ * try_to_free_buffers() fails to free them. Some callers of
+ * releasepage() require the page buffers to be dropped and treat
+ * a failure to free them as an error (generic_file_direct_IO() does).
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour (i.e. make sure the buffers are dropped upon return),
+ * wait for the current transaction to finish flushing its dirty
+ * data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && gfp_mask & GFP_KERNEL) {
+ spin_lock(&journal->j_state_lock);
+ jbd2_journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ spin_unlock(&journal->j_state_lock);
+ }
+
busy:
return ret;
}
> JBD: fix race between journal_try_to_free_buffers() and jbd commit transaction
>
> From: Mingming Cao <[email protected]>
>
> [...]
> +/*
> + * journal_try_to_free_buffers() could race with journal_commit_transaction().
> + * The latter may still hold a reference to the buffers while inspecting
> + * them on t_syncdata_list or t_locked_list.
> + *
> + * journal_try_to_free_buffers() will call this function to
> + * wait for the current transaction to finish syncing data buffers, before
> + * trying to free those buffers.
> + *
> + * Called with journal->j_state_lock held.
> + */
> +static void journal_wait_for_transaction_sync_data(journal_t *journal)
> +{
> + transaction_t *transaction = NULL;
> + tid_t tid;
> +
> + transaction = journal->j_committing_transaction;
> +
> + if (!transaction)
> + return;
> +
> + tid = transaction->t_tid;
> + spin_unlock(&journal->j_state_lock);
> + log_wait_commit(journal, tid);
> + spin_lock(&journal->j_state_lock);
> +}
What is actually the point of entering the function with j_state_lock
held and also keeping it after return? It should be enough to take it
and release it just inside this function, shouldn't it?
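I.e. something like this (an untested sketch of what I mean, with the
locking self-contained in the helper):

	static void journal_wait_for_transaction_sync_data(journal_t *journal)
	{
		transaction_t *transaction;
		tid_t tid;

		/* take and drop j_state_lock entirely inside the helper */
		spin_lock(&journal->j_state_lock);
		transaction = journal->j_committing_transaction;
		if (!transaction) {
			spin_unlock(&journal->j_state_lock);
			return;
		}
		tid = transaction->t_tid;
		spin_unlock(&journal->j_state_lock);
		log_wait_commit(journal, tid);
	}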
> /**
> * int journal_try_to_free_buffers() - try to free page buffers.
> * @journal: journal for operation
> * @page: to try and free
> - * @unused_gfp_mask: unused
> + * @gfp_mask: specifies whether the call may block
> + * (__GFP_WAIT & __GFP_FS via GFP_KERNEL)
This comment seems a bit misleading to me - I'd rather write there:
@gfp_mask: we use the mask to detect how hard we should try to release
buffers. If __GFP_WAIT and __GFP_FS are set, we wait for the commit code
to release the buffers.
> [...]
> + if (ret == 0 && gfp_mask & GFP_KERNEL) {
I think this test is wrong - it should rather be something like
(ret == 0 && ((gfp_mask & GFP_KERNEL) == GFP_KERNEL)) - or even expand the
test to (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS) && (gfp_mask &
__GFP_IO).
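The extra parentheses matter: == binds tighter than & in C, so
gfp_mask & GFP_KERNEL == GFP_KERNEL would parse as
gfp_mask & (GFP_KERNEL == GFP_KERNEL). Written out as the expanded test,
an untested sketch:

	/* == binds tighter than &, so each mask test needs its own parens */
	if (ret == 0 && (gfp_mask & __GFP_WAIT) &&
	    (gfp_mask & __GFP_FS) && (gfp_mask & __GFP_IO)) {
		spin_lock(&journal->j_state_lock);
		journal_wait_for_transaction_sync_data(journal);
		ret = try_to_free_buffers(page);
		spin_unlock(&journal->j_state_lock);
	}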
> + spin_lock(&journal->j_state_lock);
> + journal_wait_for_transaction_sync_data(journal);
> + ret = try_to_free_buffers(page);
> + spin_unlock(&journal->j_state_lock);
> + }
> +
> busy:
> return ret;
> }
> Index: linux-2.6.26-rc2/mm/filemap.c
> ===================================================================
> --- linux-2.6.26-rc2.orig/mm/filemap.c 2008-05-19 16:00:01.000000000 -0700
> +++ linux-2.6.26-rc2/mm/filemap.c 2008-05-19 16:01:34.000000000 -0700
> @@ -2581,9 +2581,8 @@ out:
> * Otherwise return zero.
> *
> * The @gfp_mask argument specifies whether I/O may be performed to release
> - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
> + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
Probably __GFP_WAIT | __GFP_IO here... But I'm not sure why we
really change this...
> *
> - * NOTE: @gfp_mask may go away, and this function may become non-blocking.
> */
> int try_to_release_page(struct page *page, gfp_t gfp_mask)
> {
>
Honza
--
Jan Kara <[email protected]>
SuSE CR Labs
On Wed, 2008-05-21 at 01:53 +0200, Jan Kara wrote:
> > [...]
> > +static void journal_wait_for_transaction_sync_data(journal_t *journal)
> > +{
> > + transaction_t *transaction = NULL;
> > + tid_t tid;
> > +
> > + transaction = journal->j_committing_transaction;
> > +
> > + if (!transaction)
> > + return;
> > +
> > + tid = transaction->t_tid;
> > + spin_unlock(&journal->j_state_lock);
> > + log_wait_commit(journal, tid);
> > + spin_lock(&journal->j_state_lock);
> > +}
> What is actually the point of entering the function with j_state_lock
> held and also keeping it after return? It should be enough to take it
> and release it just inside this function, shouldn't it?
>
I was worried about the case where we call try_to_free_buffers() again
and it races with the current transaction commit again. Is that possible?
I guess the question is whether buffers on the same page can be attached
to different transactions. If so, I think we need to keep the journal
state lock while retrying try_to_free_buffers(), so that the retry won't
race with the committing transaction again...
> > /**
> > * int journal_try_to_free_buffers() - try to free page buffers.
> > * @journal: journal for operation
> > * @page: to try and free
> > - * @unused_gfp_mask: unused
> > + * @gfp_mask: specifies whether the call may block
> > + * (__GFP_WAIT & __GFP_FS via GFP_KERNEL)
> This comment seems a bit misleading to me - I'd rather write there:
>
> @gfp_mask: we use the mask to detect how hard we should try to release
> buffers. If __GFP_WAIT and __GFP_FS are set, we wait for the commit code
> to release the buffers.
>
Sure.
> > [...]
> > + if (ret == 0 && gfp_mask & GFP_KERNEL) {
> I think this test is wrong - it should rather be something like
> (ret == 0 && ((gfp_mask & GFP_KERNEL) == GFP_KERNEL)) - or even expand the
> test to (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS) && (gfp_mask &
> __GFP_IO).
>
Thanks for pointing this out.
> > [...]
> > * The @gfp_mask argument specifies whether I/O may be performed to release
> > - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
> > + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
> Probably __GFP_WAIT | __GFP_IO here... But I'm not sure why we
> really change this...
>
For try_to_release_page(), we should wait only when both __GFP_WAIT and
__GFP_FS are set, shouldn't we?
Thanks, patch v3 to follow.
Mingming
Changes since take 2:
- fix a bug pointed out by Jan, and update the comments
journal_try_to_free_buffers() can race with the jbd commit transaction when
the latter holds the buffer reference while waiting for the data buffer
to be flushed to disk. If the caller of journal_try_to_free_buffers()
asked for a hard try at releasing the buffers, the failure is treated as an
error and returned to the caller. We have seen direct IO fail due to this
race. Some callers of releasepage() also expect the buffers to be dropped
when GFP_KERNEL is passed to releasepage()->journal_try_to_free_buffers().

With this patch, if the caller passes GFP_KERNEL to indicate that the call
may wait, then when try_to_free_buffers() fails we wait for
journal_commit_transaction() to finish committing the current committing
transaction, and then try to free those buffers again with the journal
locked.
Signed-off-by: Mingming Cao <[email protected]>
Reviewed-by: Badari Pulavarty <[email protected]>
---
fs/jbd/transaction.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++--
mm/filemap.c | 3 --
2 files changed, 56 insertions(+), 4 deletions(-)
Index: linux-2.6.26-rc3/fs/jbd/transaction.c
===================================================================
--- linux-2.6.26-rc3.orig/fs/jbd/transaction.c 2008-05-21 16:17:51.000000000 -0700
+++ linux-2.6.26-rc3/fs/jbd/transaction.c 2008-05-21 16:20:11.000000000 -0700
@@ -1648,12 +1648,40 @@ out:
return;
}
+/*
+ * journal_try_to_free_buffers() could race with journal_commit_transaction().
+ * The latter may still hold a reference to the buffers while inspecting
+ * them on t_syncdata_list or t_locked_list.
+ *
+ * journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free those buffers.
+ *
+ * Called with journal->j_state_lock held.
+ */
+static void journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+ tid_t tid;
+
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction)
+ return;
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ log_wait_commit(journal, tid);
+ spin_lock(&journal->j_state_lock);
+}
/**
* int journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard we should try to release
+ * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for the commit code
+ * to release the buffers.
*
*
* For all the buffers on this page,
@@ -1682,9 +1710,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1713,7 +1743,30 @@ int journal_try_to_free_buffers(journal_
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where journal_try_to_free_buffers()
+ * could race with journal_commit_transaction(): the latter still
+ * holds a reference to the buffers while processing them, so
+ * try_to_free_buffers() fails to free them. Some callers of
+ * releasepage() require the page buffers to be dropped and treat
+ * a failure to free them as an error (generic_file_direct_IO() does).
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour (i.e. make sure the buffers are dropped upon return),
+ * wait for the current transaction to finish flushing its dirty
+ * data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && ((gfp_mask & GFP_KERNEL) == GFP_KERNEL)) {
+ spin_lock(&journal->j_state_lock);
+ journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ spin_unlock(&journal->j_state_lock);
+ }
+
busy:
return ret;
}
Index: linux-2.6.26-rc3/mm/filemap.c
===================================================================
--- linux-2.6.26-rc3.orig/mm/filemap.c 2008-05-21 16:17:51.000000000 -0700
+++ linux-2.6.26-rc3/mm/filemap.c 2008-05-21 16:17:58.000000000 -0700
@@ -2581,9 +2581,8 @@ out:
* Otherwise return zero.
*
* The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
*
- * NOTE: @gfp_mask may go away, and this function may become non-blocking.
*/
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
journal_try_to_free_buffers() can race with the jbd commit transaction when
the latter holds the buffer reference while waiting for the data buffer
to be flushed to disk. If the caller of journal_try_to_free_buffers()
asked for a hard try at releasing the buffers, the failure is treated as an
error and returned to the caller. We have seen direct IO fail due to this
race. Some callers of releasepage() also expect the buffers to be dropped
when GFP_KERNEL is passed to releasepage()->journal_try_to_free_buffers().

With this patch, if the caller passes GFP_KERNEL to indicate that the call
may wait, then when try_to_free_buffers() fails we wait for
journal_commit_transaction() to finish committing the current committing
transaction, and then try to free those buffers again with the journal
locked.
Signed-off-by: Mingming Cao <[email protected]>
Reviewed-by: Badari Pulavarty <[email protected]>
---
fs/jbd2/transaction.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 55 insertions(+), 2 deletions(-)
Index: linux-2.6.26-rc3/fs/jbd2/transaction.c
===================================================================
--- linux-2.6.26-rc3.orig/fs/jbd2/transaction.c 2008-05-21 16:17:51.000000000 -0700
+++ linux-2.6.26-rc3/fs/jbd2/transaction.c 2008-05-21 16:22:03.000000000 -0700
@@ -1656,12 +1656,40 @@ out:
return;
}
+/*
+ * jbd2_journal_try_to_free_buffers() could race with jbd2_journal_commit_transaction().
+ * The latter may still hold a reference to the buffers while inspecting
+ * them on t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free those buffers.
+ *
+ * Called with journal->j_state_lock held.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+ tid_t tid;
+
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction)
+ return;
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+ spin_lock(&journal->j_state_lock);
+}
/**
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard we should try to release
+ * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for the commit code
+ * to release the buffers.
*
*
* For all the buffers on this page,
@@ -1690,9 +1718,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1721,7 +1751,30 @@ int jbd2_journal_try_to_free_buffers(jou
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where jbd2_journal_try_to_free_buffers()
+ * could race with jbd2_journal_commit_transaction(), the later still
+ * holds the reference to the buffers to free while processing them.
+ * try_to_free_buffers() failed to free those buffers. Some of the
+ * caller of releasepage() request page buffers to be dropped, otherwise
+ * treat the fail-to-free as errors (such as generic_file_direct_IO())
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour(i.e make sure buffers are dropped upon return),
+ * let's wait for the current transaction to finish flush of
+ * dirty data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && (gfp_mask & GFP_KERNEL == GFP_KERNEL)) {
+ spin_lock(&journal->j_state_lock);
+ jbd2_journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ spin_unlock(&journal->j_state_lock);
+ }
+
busy:
return ret;
}
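The direct I/O failures mentioned in the changelog above come about
because a direct write must first invalidate any cached pages over its
range, and invalidation can only succeed once releasepage() has dropped
the page's buffers. A deliberately simplified sketch of that dependency
(the helper name is made up and this is not the kernel's exact code):

static int invalidate_page_for_dio(struct address_space *mapping,
				   struct page *page)
{
	/*
	 * For ext3/ext4, ->releasepage() ends up in
	 * journal_try_to_free_buffers(); if the commit code still holds
	 * a reference on one of the buffers, the release fails and the
	 * error propagates back to the direct-I/O caller.
	 */
	if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
		return -EIO;
	remove_from_page_cache(page);
	return 0;
}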
On Wed, 21 May 2008 16:38:07 -0700 Mingming <[email protected]> wrote:
>
> Subject: [PATCH 1/2][TAKE3] JBD: Fix race between free buffer and commit trasanction
"fix race between buffer freeing and transaction commit", perhaps.
> Changes since take 2:
> - fix a bug pointed by Jan, and updated the comments
>
>
> journal_try_to_free_buffers() could race with jbd commit transaction when
> the later is holding the buffer reference while waiting for the data buffer
> to flush to disk. If the caller of journal_try_to_free_buffers() request
> tries hard to release the buffers, it will treat the failure as error and return
> back to the caller. We have seen the directo IO failed due to this race.
> Some of the caller of releasepage() also expecting the buffer to be dropped
> when passed with GFP_KERNEL mask to the releasepage()->journal_try_to_free_buffers().
>
> With this patch, if the caller is passing the GFP_KERNEL to indicating this
> call could wait, in case of try_to_free_buffers() failed, let's waiting for
> journal_commit_transaction() to finish commit the current committing transaction
> , then try to free those buffers again with journal locked.
>
> Signed-off-by: Mingming Cao <[email protected]>
> Reviewed-by: Badari Pulavarty <[email protected]>
> ---
> fs/jbd/transaction.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++--
> mm/filemap.c | 3 --
> 2 files changed, 56 insertions(+), 4 deletions(-)
>
> Index: linux-2.6.26-rc3/fs/jbd/transaction.c
> ===================================================================
> --- linux-2.6.26-rc3.orig/fs/jbd/transaction.c 2008-05-21 16:17:51.000000000 -0700
> +++ linux-2.6.26-rc3/fs/jbd/transaction.c 2008-05-21 16:20:11.000000000 -0700
> @@ -1648,12 +1648,40 @@ out:
> return;
> }
>
> +/*
> + * journal_try_to_free_buffers() could race with journal_commit_transaction()
> + * The later might still hold the reference count to the buffers when inspecting
"latter"
"hold a reference on"
> + * them on t_syncdata_list or t_locked_list.
> + *
> + * Journal_try_to_free_buffers() will call this function to
"journal_try_to_free_buffers"
> + * wait for the current transaction to finish syncing data buffers, before
> + * try to free that buffer.
"trying"
> + *
> + * Called with journal->j_state_lock hold.
"held"
> + */
> +static void journal_wait_for_transaction_sync_data(journal_t *journal)
> +{
> + transaction_t *transaction = NULL;
Unneeded initialisation. Could just do
transaction_t *transaction = journal->j_committing_transaction;
> + tid_t tid;
> +
> + transaction = journal->j_committing_transaction;
> +
> + if (!transaction)
> + return;
> +
> + tid = transaction->t_tid;
> + spin_unlock(&journal->j_state_lock);
> + log_wait_commit(journal, tid);
> + spin_lock(&journal->j_state_lock);
> +}
>
> /**
> * int journal_try_to_free_buffers() - try to free page buffers.
> * @journal: journal for operation
> * @page: to try and free
> - * @unused_gfp_mask: unused
> + * @gfp_mask: we use the mask to detect how hard should we try to release
> + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
> + * release the buffers.
> *
> *
> * For all the buffers on this page,
> @@ -1682,9 +1710,11 @@ out:
> * journal_try_to_free_buffer() is changing its state. But that
> * cannot happen because we never reallocate freed data as metadata
> * while the data is part of a transaction. Yes?
> + *
> + * Return 0 on failure, 1 on success
> */
> int journal_try_to_free_buffers(journal_t *journal,
> - struct page *page, gfp_t unused_gfp_mask)
> + struct page *page, gfp_t gfp_mask)
> {
> struct buffer_head *head;
> struct buffer_head *bh;
> @@ -1713,7 +1743,30 @@ int journal_try_to_free_buffers(journal_
> if (buffer_jbd(bh))
> goto busy;
> } while ((bh = bh->b_this_page) != head);
> +
> ret = try_to_free_buffers(page);
> +
> + /*
> + * There are a number of places where journal_try_to_free_buffers()
> + * could race with journal_commit_transaction(), the later still
> + * holds the reference to the buffers to free while processing them.
"the latter still holds a reference on the buffers"
> + * try_to_free_buffers() failed to free those buffers. Some of the
> + * caller of releasepage() request page buffers to be dropped, otherwise
"callers"
"request the"
> * treat the fail-to-free as errors (such as generic_file_direct_IO())
> + *
> + * So, if the caller of try_to_release_page() wants the synchronous
> + * behaviour(i.e make sure buffers are dropped upon return),
> + * let's wait for the current transaction to finish flush of
"the flush"
> + * dirty data buffers, then try to free those buffers again,
> + * with the journal locked.
> + */
> + if (ret == 0 && (gfp_mask & GFP_KERNEL == GFP_KERNEL)) {
Sorry about all the spelling flames ;) I'd normally just fix them
myself rather than typing them all into an email and having you type
them in again, etc. But I think the patch needs to be respun anyway.
The mask-and-compare with GFP_KERNEL does appear to be correct, but it
is quite unusual. Generally in a situation like this we will test for
the specific __GFP_foo flags which we're interested in. For
documentation reasons if nothing else.
So the preferred form here would be
if (ret == 0 &&
(gfp_mask & (__GFP_WAIT|__GFP_FS)) == (__GFP_WAIT|__GFP_FS)) {
which really tells the reader what we're trying to do here. And I
don't think this code cares about __GFP_IO, even though it would be
mighty peculiar (probably buggy) for someone to do
alloc_pages(__GFP_FS|__GFP_WAIT).
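As written, the test also has a C precedence wrinkle: == binds more
tightly than &, so gfp_mask & GFP_KERNEL == GFP_KERNEL parses as
gfp_mask & (GFP_KERNEL == GFP_KERNEL), which is one more reason to spell
the flags out. A minimal userspace sketch of the preferred form (the
flag values below match gfp.h of this era, but treat them and the helper
name as illustrative):

#include <assert.h>

#define __GFP_WAIT	0x10u
#define __GFP_IO	0x40u
#define __GFP_FS	0x80u
#define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)

/* Made-up helper: may we block and recurse into the fs to free buffers? */
static int may_wait_for_commit(unsigned int gfp_mask)
{
	return (gfp_mask & (__GFP_WAIT | __GFP_FS)) ==
	       (__GFP_WAIT | __GFP_FS);
}

int main(void)
{
	assert(may_wait_for_commit(GFP_KERNEL));
	/* GFP_NOFS == GFP_KERNEL without __GFP_FS: must not wait here. */
	assert(!may_wait_for_commit(GFP_KERNEL & ~__GFP_FS));
	return 0;
}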
> + spin_lock(&journal->j_state_lock);
> + journal_wait_for_transaction_sync_data(journal);
> + ret = try_to_free_buffers(page);
> + spin_unlock(&journal->j_state_lock);
> + }
Did we actually need to hold j_state_lock across the
try_to_free_buffers() call here? Because it'll increase hold times and
will introduce a lock-ranking dependency which we might not otherwise
have had (I didn't check).
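To make the ranking concern concrete: try_to_free_buffers() takes the
page's mapping->private_lock internally, so holding j_state_lock across
it establishes a j_state_lock -> private_lock ordering. A userspace
analogue with pthread mutexes standing in for the spinlocks (whether any
kernel path takes them in the opposite order is exactly the unchecked
part):

#include <pthread.h>

/* Stand-ins for journal->j_state_lock and mapping->private_lock. */
static pthread_mutex_t j_state_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t private_lock = PTHREAD_MUTEX_INITIALIZER;

/* The patched path: j_state_lock held across try_to_free_buffers(),
 * which takes private_lock internally, i.e. order A -> B. */
static void *release_path(void *unused)
{
	pthread_mutex_lock(&j_state_lock);
	pthread_mutex_lock(&private_lock);
	pthread_mutex_unlock(&private_lock);
	pthread_mutex_unlock(&j_state_lock);
	return NULL;
}

/* A hypothetical path taking the locks in the opposite order, B -> A;
 * if one exists anywhere, the two threads can deadlock. */
static void *other_path(void *unused)
{
	pthread_mutex_lock(&private_lock);
	pthread_mutex_lock(&j_state_lock);
	pthread_mutex_unlock(&j_state_lock);
	pthread_mutex_unlock(&private_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, release_path, NULL);
	pthread_create(&b, NULL, other_path, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}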
> busy:
> return ret;
> }
> Index: linux-2.6.26-rc3/mm/filemap.c
> ===================================================================
> --- linux-2.6.26-rc3.orig/mm/filemap.c 2008-05-21 16:17:51.000000000 -0700
> +++ linux-2.6.26-rc3/mm/filemap.c 2008-05-21 16:17:58.000000000 -0700
> @@ -2581,9 +2581,8 @@ out:
> * Otherwise return zero.
> *
> * The @gfp_mask argument specifies whether I/O may be performed to release
> - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
> + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
> *
> - * NOTE: @gfp_mask may go away, and this function may become non-blocking.
Yup, that note is dead.
> */
> int try_to_release_page(struct page *page, gfp_t gfp_mask)
> {
>
> On Wed, 2008-05-21 at 01:53 +0200, Jan Kara wrote:
> > > fs/jbd/transaction.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > > mm/filemap.c | 3 --
> > > 2 files changed, 54 insertions(+), 4 deletions(-)
> > >
> > > Index: linux-2.6.26-rc2/fs/jbd/transaction.c
> > > ===================================================================
> > > --- linux-2.6.26-rc2.orig/fs/jbd/transaction.c 2008-05-11 17:09:41.000000000 -0700
> > > +++ linux-2.6.26-rc2/fs/jbd/transaction.c 2008-05-19 16:16:41.000000000 -0700
> > > @@ -1648,12 +1648,39 @@ out:
> > > return;
> > > }
> > >
> > > +/*
> > > + * journal_try_to_free_buffers() could race with journal_commit_transaction()
> > > + * The later might still hold the reference count to the buffers when inspecting
> > > + * them on t_syncdata_list or t_locked_list.
> > > + *
> > > + * Journal_try_to_free_buffers() will call this function to
> > > + * wait for the current transaction to finish syncing data buffers, before
> > > + * try to free that buffer.
> > > + *
> > > + * Called with journal->j_state_lock hold.
> > > + */
> > > +static void journal_wait_for_transaction_sync_data(journal_t *journal)
> > > +{
> > > + transaction_t *transaction = NULL;
> > > + tid_t tid;
> > > +
> > > + transaction = journal->j_committing_transaction;
> > > +
> > > + if (!transaction)
> > > + return;
> > > +
> > > + tid = transaction->t_tid;
> > > + spin_unlock(&journal->j_state_lock);
> > > + log_wait_commit(journal, tid);
> > > + spin_lock(&journal->j_state_lock);
> > > +}
> > What is actually the point of entering the function with j_state_lock
> > held and also keeping it after return? It should be enough to take it
> > and release it just inside this function, shouldn't it?
> >
>
> I was worried about the case when we call try_to_free_buffers() again,
> it races with the current transaction commit again. Is it possible? I
> guess the question is whether it is possible to have buffers on the same
> page attached to different transaction. If so, I think we need to keep
> the journal state lock while retry try_to_free_buffers(), so that the
> retry won't race with the commit transaction again...
Well, but by the time log_wait_commit() finishes, it may well
happen that a new transaction has already started, so your lock doesn't
help you much. And the page you are called on is actually locked, so
no one can really mess with it until you unlock it... So I think you can
just use the lock for obtaining the tid and then drop it.
Honza
PS: For JBD2 you'd need to be a bit more careful because you cannot call
log_wait_commit() while holding the page lock (we have reversed locking
order for ext4) - but the ordered-mode rewrite patch actually fixes this
problem and I'm going to submit the split patches on Monday or
Tuesday (I only need to test that I didn't do something stupid
while porting them to ext4)...
--
Jan Kara <[email protected]>
SuSE CR Labs
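The pattern Jan is suggesting - take j_state_lock only long enough to
snapshot the committing transaction's tid, then sleep unlocked - is what
take 4 below adopts; as a minimal sketch (hypothetical function name):

static void wait_for_committing_data(journal_t *journal)
{
	tid_t tid;

	spin_lock(&journal->j_state_lock);
	if (!journal->j_committing_transaction) {
		spin_unlock(&journal->j_state_lock);
		return;
	}
	/* Snapshot the tid under the lock... */
	tid = journal->j_committing_transaction->t_tid;
	spin_unlock(&journal->j_state_lock);
	/* ...then sleep for the commit without holding any spinlock. */
	log_wait_commit(journal, tid);
}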
On Sun, 2008-05-25 at 00:44 +0200, Jan Kara wrote:
> > On Wed, 2008-05-21 at 01:53 +0200, Jan Kara wrote:
> > > > fs/jbd/transaction.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > > > mm/filemap.c | 3 --
> > > > 2 files changed, 54 insertions(+), 4 deletions(-)
> > > >
> > > > Index: linux-2.6.26-rc2/fs/jbd/transaction.c
> > > > ===================================================================
> > > > --- linux-2.6.26-rc2.orig/fs/jbd/transaction.c 2008-05-11 17:09:41.000000000 -0700
> > > > +++ linux-2.6.26-rc2/fs/jbd/transaction.c 2008-05-19 16:16:41.000000000 -0700
> > > > @@ -1648,12 +1648,39 @@ out:
> > > > return;
> > > > }
> > > >
> > > > +/*
> > > > + * journal_try_to_free_buffers() could race with journal_commit_transaction()
> > > > + * The later might still hold the reference count to the buffers when inspecting
> > > > + * them on t_syncdata_list or t_locked_list.
> > > > + *
> > > > + * Journal_try_to_free_buffers() will call this function to
> > > > + * wait for the current transaction to finish syncing data buffers, before
> > > > + * try to free that buffer.
> > > > + *
> > > > + * Called with journal->j_state_lock hold.
> > > > + */
> > > > +static void journal_wait_for_transaction_sync_data(journal_t *journal)
> > > > +{
> > > > + transaction_t *transaction = NULL;
> > > > + tid_t tid;
> > > > +
> > > > + transaction = journal->j_committing_transaction;
> > > > +
> > > > + if (!transaction)
> > > > + return;
> > > > +
> > > > + tid = transaction->t_tid;
> > > > + spin_unlock(&journal->j_state_lock);
> > > > + log_wait_commit(journal, tid);
> > > > + spin_lock(&journal->j_state_lock);
> > > > +}
> > > What is actually the point of entering the function with j_state_lock
> > > held and also keeping it after return? It should be enough to take it
> > > and release it just inside this function, shouldn't it?
> > >
> >
> > I was worried about the case when we call try_to_free_buffers() again,
> > it races with the current transaction commit again. Is it possible? I
> > guess the question is whether it is possible to have buffers on the same
> > page attached to different transaction. If so, I think we need to keep
> > the journal state lock while retry try_to_free_buffers(), so that the
> > retry won't race with the commit transaction again...
> Well, but by the time log_wait_commit() finishes, it may well
> happen that a new transaction is already started so your lock doesn't
> help you much. And the page you are called on is actually locked, so
> noone can really mess with it until you unlock it... So I think you can
> just use the lock for obtaining tid and then drop it.
>
You are right that the page is locked while we are trying
to free the buffers, so I agree it's safe to drop the lock.
> Honza
>
> PS: For JBD2 you'd need to be a bit more careful because you cannot call
> log_wait_commit() while holding page lock (we have reversed locking
> order for ext4) - but ordered-mode rewrite patch actually fixes this
> problem and I'm going to submit the splitted patches on Monday or
> Tuesday (I only need to test them that I didn't do something stupid
> while porting them to ext4)...
>
Thanks for pointing this out. I think once we put back the reversed
locking order and the new ordered mode, the jbd2 patch could go away...
Updated patch for JBD (take 4) below.
Mingming
JBD: fix race between journal_try_to_free_buffers() and jbd commit transaction
From: Mingming Cao <[email protected]>
journal_try_to_free_buffers() could race with the jbd commit code when
the latter is holding a reference on a buffer while waiting for the data
buffer to be flushed to disk. If the caller of journal_try_to_free_buffers()
is trying hard to release the buffers, it will treat the failure as an
error and return it to its own caller. We have seen direct I/O fail due
to this race. Some callers of releasepage() also expect the buffers to be
dropped when GFP_KERNEL is passed down through
releasepage()->journal_try_to_free_buffers().
With this patch, if the caller passes GFP_KERNEL to indicate that the
call may wait then, should try_to_free_buffers() fail, we wait for
journal_commit_transaction() to finish committing the current committing
transaction and then try to free those buffers again with the journal
locked.
Signed-off-by: Mingming Cao <[email protected]>
Reviewed-by: Badari Pulavarty <[email protected]>
---
fs/jbd/transaction.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++--
mm/filemap.c | 3 --
2 files changed, 56 insertions(+), 4 deletions(-)
Index: linux-2.6.26-rc3/fs/jbd/transaction.c
===================================================================
--- linux-2.6.26-rc3.orig/fs/jbd/transaction.c 2008-05-28 10:55:37.000000000 -0700
+++ linux-2.6.26-rc3/fs/jbd/transaction.c 2008-05-28 10:57:32.000000000 -0700
@@ -1648,12 +1648,42 @@ out:
return;
}
+/*
+ * journal_try_to_free_buffers() could race with journal_commit_transaction()
+ * The later might still hold the reference count to the buffers when inspecting
+ * them on t_syncdata_list or t_locked_list.
+ *
+ * Journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * try to free that buffer.
+ *
+ * Called with journal->j_state_lock hold.
+ */
+static void journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ log_wait_commit(journal, tid);
+}
/**
* int journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
*
*
* For all the buffers on this page,
@@ -1682,9 +1712,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1713,7 +1745,28 @@ int journal_try_to_free_buffers(journal_
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where journal_try_to_free_buffers()
+ * could race with journal_commit_transaction(), the later still
+ * holds the reference to the buffers to free while processing them.
+ * try_to_free_buffers() failed to free those buffers. Some of the
+ * caller of releasepage() request page buffers to be dropped, otherwise
+ * treat the fail-to-free as errors (such as generic_file_direct_IO())
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour(i.e make sure buffers are dropped upon return),
+ * let's wait for the current transaction to finish flush of
+ * dirty data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && (gfp_mask & GFP_KERNEL == GFP_KERNEL)) {
+ journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ }
+
busy:
return ret;
}
Index: linux-2.6.26-rc3/mm/filemap.c
===================================================================
--- linux-2.6.26-rc3.orig/mm/filemap.c 2008-05-28 10:55:38.000000000 -0700
+++ linux-2.6.26-rc3/mm/filemap.c 2008-05-28 10:55:43.000000000 -0700
@@ -2581,9 +2581,8 @@ out:
* Otherwise return zero.
*
* The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
*
- * NOTE: @gfp_mask may go away, and this function may become non-blocking.
*/
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
On Wed 28-05-08 11:18:59, Mingming Cao wrote:
> On Sun, 2008-05-25 at 00:44 +0200, Jan Kara wrote:
> > > On Wed, 2008-05-21 at 01:53 +0200, Jan Kara wrote:
> > > > > fs/jbd/transaction.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > > > > mm/filemap.c | 3 --
> > > > > 2 files changed, 54 insertions(+), 4 deletions(-)
> > > > >
> > > > > Index: linux-2.6.26-rc2/fs/jbd/transaction.c
> > > > > ===================================================================
> > > > > --- linux-2.6.26-rc2.orig/fs/jbd/transaction.c 2008-05-11 17:09:41.000000000 -0700
> > > > > +++ linux-2.6.26-rc2/fs/jbd/transaction.c 2008-05-19 16:16:41.000000000 -0700
> > > > > @@ -1648,12 +1648,39 @@ out:
> > > > > return;
> > > > > }
> > > > >
> > > > > +/*
> > > > > + * journal_try_to_free_buffers() could race with journal_commit_transaction()
> > > > > + * The later might still hold the reference count to the buffers when inspecting
> > > > > + * them on t_syncdata_list or t_locked_list.
> > > > > + *
> > > > > + * Journal_try_to_free_buffers() will call this function to
> > > > > + * wait for the current transaction to finish syncing data buffers, before
> > > > > + * try to free that buffer.
> > > > > + *
> > > > > + * Called with journal->j_state_lock hold.
> > > > > + */
> > > > > +static void journal_wait_for_transaction_sync_data(journal_t *journal)
> > > > > +{
> > > > > + transaction_t *transaction = NULL;
> > > > > + tid_t tid;
> > > > > +
> > > > > + transaction = journal->j_committing_transaction;
> > > > > +
> > > > > + if (!transaction)
> > > > > + return;
> > > > > +
> > > > > + tid = transaction->t_tid;
> > > > > + spin_unlock(&journal->j_state_lock);
> > > > > + log_wait_commit(journal, tid);
> > > > > + spin_lock(&journal->j_state_lock);
> > > > > +}
> > > > What is actually the point of entering the function with j_state_lock
> > > > held and also keeping it after return? It should be enough to take it
> > > > and release it just inside this function, shouldn't it?
> > > >
> > >
> > > I was worried about the case when we call try_to_free_buffers() again,
> > > it races with the current transaction commit again. Is it possible? I
> > > guess the question is whether it is possible to have buffers on the same
> > > page attached to different transaction. If so, I think we need to keep
> > > the journal state lock while retry try_to_free_buffers(), so that the
> > > retry won't race with the commit transaction again...
> > Well, but by the time log_wait_commit() finishes, it may well
> > happen that a new transaction is already started so your lock doesn't
> > help you much. And the page you are called on is actually locked, so
> > noone can really mess with it until you unlock it... So I think you can
> > just use the lock for obtaining tid and then drop it.
> >
>
> You are right that the page was locked during the process we are trying
> to free the buffer. so I agree it's safe to drop the lock.
>
> > Honza
> >
> > PS: For JBD2 you'd need to be a bit more careful because you cannot call
> > log_wait_commit() while holding page lock (we have reversed locking
> > order for ext4) - but ordered-mode rewrite patch actually fixes this
> > problem and I'm going to submit the splitted patches on Monday or
> > Tuesday (I only need to test them that I didn't do something stupid
> > while porting them to ext4)...
> >
> Thanks for pointing this out. I think when we put back the reversed
> locking order and new ordered mode the jbd2 patch could go away...
>
> Updated patch for JBD (take 4) below.
> Mingming
>
> JBD: fix race between journal_try_to_free_buffers() and jbd commit transaction
>
> From: Mingming Cao <[email protected]>
>
> journal_try_to_free_buffers() could race with jbd commit transaction when
> the later is holding the buffer reference while waiting for the data buffer
> to flush to disk. If the caller of journal_try_to_free_buffers() request
> tries hard to release the buffers, it will treat the failure as error and return
> back to the caller. We have seen the directo IO failed due to this race.
> Some of the caller of releasepage() also expecting the buffer to be dropped
> when passed with GFP_KERNEL mask to the releasepage()->journal_try_to_free_buffers().
>
> With this patch, if the caller is passing the GFP_KERNEL to indicating this
> call could wait, in case of try_to_free_buffers() failed, let's waiting for
> journal_commit_transaction() to finish commit the current committing transaction
> , then try to free those buffers again with journal locked.
>
> Signed-off-by: Mingming Cao <[email protected]>
> Reviewed-by: Badari Pulavarty <[email protected]>
> ---
> fs/jbd/transaction.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++--
> mm/filemap.c | 3 --
> 2 files changed, 56 insertions(+), 4 deletions(-)
>
> Index: linux-2.6.26-rc3/fs/jbd/transaction.c
> ===================================================================
> --- linux-2.6.26-rc3.orig/fs/jbd/transaction.c 2008-05-28 10:55:37.000000000 -0700
> +++ linux-2.6.26-rc3/fs/jbd/transaction.c 2008-05-28 10:57:32.000000000 -0700
> @@ -1648,12 +1648,42 @@ out:
> return;
> }
>
> +/*
> + * journal_try_to_free_buffers() could race with journal_commit_transaction()
> + * The later might still hold the reference count to the buffers when inspecting
> + * them on t_syncdata_list or t_locked_list.
> + *
> + * Journal_try_to_free_buffers() will call this function to
> + * wait for the current transaction to finish syncing data buffers, before
> + * try to free that buffer.
> + *
> + * Called with journal->j_state_lock hold.
> + */
> +static void journal_wait_for_transaction_sync_data(journal_t *journal)
> +{
> + transaction_t *transaction = NULL;
> + tid_t tid;
> +
> + spin_lock(&journal->j_state_lock);
> + transaction = journal->j_committing_transaction;
> +
> + if (!transaction) {
> + spin_unlock(&journal->j_state_lock);
> + return;
> + }
> +
> + tid = transaction->t_tid;
> + spin_unlock(&journal->j_state_lock);
> + log_wait_commit(journal, tid);
> +}
>
> /**
> * int journal_try_to_free_buffers() - try to free page buffers.
> * @journal: journal for operation
> * @page: to try and free
> - * @unused_gfp_mask: unused
> + * @gfp_mask: we use the mask to detect how hard should we try to release
> + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
> + * release the buffers.
> *
> *
> * For all the buffers on this page,
> @@ -1682,9 +1712,11 @@ out:
> * journal_try_to_free_buffer() is changing its state. But that
> * cannot happen because we never reallocate freed data as metadata
> * while the data is part of a transaction. Yes?
> + *
> + * Return 0 on failure, 1 on success
> */
> int journal_try_to_free_buffers(journal_t *journal,
> - struct page *page, gfp_t unused_gfp_mask)
> + struct page *page, gfp_t gfp_mask)
> {
> struct buffer_head *head;
> struct buffer_head *bh;
> @@ -1713,7 +1745,28 @@ int journal_try_to_free_buffers(journal_
> if (buffer_jbd(bh))
> goto busy;
> } while ((bh = bh->b_this_page) != head);
> +
> ret = try_to_free_buffers(page);
> +
> + /*
> + * There are a number of places where journal_try_to_free_buffers()
> + * could race with journal_commit_transaction(), the later still
> + * holds the reference to the buffers to free while processing them.
> + * try_to_free_buffers() failed to free those buffers. Some of the
> + * caller of releasepage() request page buffers to be dropped, otherwise
> + * treat the fail-to-free as errors (such as generic_file_direct_IO())
> + *
> + * So, if the caller of try_to_release_page() wants the synchronous
> + * behaviour(i.e make sure buffers are dropped upon return),
> + * let's wait for the current transaction to finish flush of
> + * dirty data buffers, then try to free those buffers again,
> + * with the journal locked.
> + */
> + if (ret == 0 && (gfp_mask & GFP_KERNEL == GFP_KERNEL)) {
I think Andrew preferred this test to be expanded, but otherwise the patch
is fine now. You can add:
Acked-by: Jan Kara <[email protected]>
Thanks for solving this :)
> + journal_wait_for_transaction_sync_data(journal);
> + ret = try_to_free_buffers(page);
> + }
> +
> busy:
> return ret;
> }
> Index: linux-2.6.26-rc3/mm/filemap.c
> ===================================================================
> --- linux-2.6.26-rc3.orig/mm/filemap.c 2008-05-28 10:55:38.000000000 -0700
> +++ linux-2.6.26-rc3/mm/filemap.c 2008-05-28 10:55:43.000000000 -0700
> @@ -2581,9 +2581,8 @@ out:
> * Otherwise return zero.
> *
> * The @gfp_mask argument specifies whether I/O may be performed to release
> - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
> + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
> *
> - * NOTE: @gfp_mask may go away, and this function may become non-blocking.
> */
> int try_to_release_page(struct page *page, gfp_t gfp_mask)
> {
>
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Wed, 2008-05-28 at 20:55 +0200, Jan Kara wrote:
> On Wed 28-05-08 11:18:59, Mingming Cao wrote:
> > @@ -1682,9 +1712,11 @@ out:
> > * journal_try_to_free_buffer() is changing its state. But that
> > * cannot happen because we never reallocate freed data as metadata
> > * while the data is part of a transaction. Yes?
> > + *
> > + * Return 0 on failure, 1 on success
> > */
> > int journal_try_to_free_buffers(journal_t *journal,
> > - struct page *page, gfp_t unused_gfp_mask)
> > + struct page *page, gfp_t gfp_mask)
> > {
> > struct buffer_head *head;
> > struct buffer_head *bh;
> > @@ -1713,7 +1745,28 @@ int journal_try_to_free_buffers(journal_
> > if (buffer_jbd(bh))
> > goto busy;
> > } while ((bh = bh->b_this_page) != head);
> > +
> > ret = try_to_free_buffers(page);
> > +
> > + /*
> > + * There are a number of places where journal_try_to_free_buffers()
> > + * could race with journal_commit_transaction(), the later still
> > + * holds the reference to the buffers to free while processing them.
> > + * try_to_free_buffers() failed to free those buffers. Some of the
> > + * caller of releasepage() request page buffers to be dropped, otherwise
> > + * treat the fail-to-free as errors (such as generic_file_direct_IO())
> > + *
> > + * So, if the caller of try_to_release_page() wants the synchronous
> > + * behaviour(i.e make sure buffers are dropped upon return),
> > + * let's wait for the current transaction to finish flush of
> > + * dirty data buffers, then try to free those buffers again,
> > + * with the journal locked.
> > + */
> > + if (ret == 0 && (gfp_mask & GFP_KERNEL == GFP_KERNEL)) {
> I think Andrew prefered this test to be expanded but otherwise the patch
> is fine now. You can add:
> Acked-by: Jan Kara <[email protected]>
>
Okay, I will update the patch, clean up the history, and send it to
Andrew. Thanks.
Mingming
Updated patch after Jan's ack.
journal_try_to_free_buffers() could race with the jbd commit code when
the latter is holding a reference on a buffer while waiting for the data
buffer to be flushed to disk. If the caller of journal_try_to_free_buffers()
is trying hard to release the buffers, it will treat the failure as an
error and return it to its own caller. We have seen direct I/O fail due
to this race. Some callers of releasepage() also expect the buffers to be
dropped when GFP_KERNEL is passed down through
releasepage()->journal_try_to_free_buffers().
With this patch, if the caller passes __GFP_WAIT and __GFP_FS to indicate
that the call may wait then, should try_to_free_buffers() fail, we wait
for journal_commit_transaction() to finish committing the current
committing transaction and then try to free those buffers again.
Signed-off-by: Mingming Cao <[email protected]>
Reviewed-by: Badari Pulavarty <[email protected]>
Acked-by: Jan Kara <[email protected]>
---
fs/jbd/transaction.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++--
mm/filemap.c | 3 --
2 files changed, 56 insertions(+), 4 deletions(-)
Index: linux-2.6.26-rc3/fs/jbd/transaction.c
===================================================================
--- linux-2.6.26-rc3.orig/fs/jbd/transaction.c 2008-05-28 14:16:41.000000000 -0700
+++ linux-2.6.26-rc3/fs/jbd/transaction.c 2008-05-28 16:12:15.000000000 -0700
@@ -1648,12 +1648,42 @@ out:
return;
}
+/*
+ * journal_try_to_free_buffers() could race with journal_commit_transaction()
+ * The later might still hold the reference count to the buffers when inspecting
+ * them on t_syncdata_list or t_locked_list.
+ *
+ * Journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * try to free that buffer.
+ *
+ * Called with journal->j_state_lock hold.
+ */
+static void journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ log_wait_commit(journal, tid);
+}
/**
* int journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
*
*
* For all the buffers on this page,
@@ -1682,9 +1712,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1713,7 +1745,28 @@ int journal_try_to_free_buffers(journal_
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where journal_try_to_free_buffers()
+ * could race with journal_commit_transaction(), the later still
+ * holds the reference to the buffers to free while processing them.
+ * try_to_free_buffers() failed to free those buffers. Some of the
+ * caller of releasepage() request page buffers to be dropped, otherwise
+ * treat the fail-to-free as errors (such as generic_file_direct_IO())
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour(i.e make sure buffers are dropped upon return),
+ * let's wait for the current transaction to finish flush of
+ * dirty data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+ journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ }
+
busy:
return ret;
}
Index: linux-2.6.26-rc3/mm/filemap.c
===================================================================
--- linux-2.6.26-rc3.orig/mm/filemap.c 2008-05-28 14:16:41.000000000 -0700
+++ linux-2.6.26-rc3/mm/filemap.c 2008-05-28 14:17:17.000000000 -0700
@@ -2581,9 +2581,8 @@ out:
* Otherwise return zero.
*
* The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
*
- * NOTE: @gfp_mask may go away, and this function may become non-blocking.
*/
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
journal_try_to_free_buffers() could race with the jbd commit code when
the latter is holding a reference on a buffer while waiting for the data
buffer to be flushed to disk. If the caller of journal_try_to_free_buffers()
is trying hard to release the buffers, it will treat the failure as an
error and return it to its own caller. We have seen direct I/O fail due
to this race. Some callers of releasepage() also expect the buffers to be
dropped when GFP_KERNEL is passed down through
releasepage()->journal_try_to_free_buffers().
With this patch, if the caller passes __GFP_WAIT and __GFP_FS to indicate
that the call may wait then, should try_to_free_buffers() fail, we wait
for journal_commit_transaction() to finish committing the current
committing transaction and then try to free those buffers again.
Signed-off-by: Mingming Cao <[email protected]>
Reviewed-by: Badari Pulavarty <[email protected]>
Acked-by: Jan Kara <[email protected]>
---
fs/jbd2/transaction.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 57 insertions(+), 2 deletions(-)
Index: linux-2.6.26-rc3/fs/jbd2/transaction.c
===================================================================
--- linux-2.6.26-rc3.orig/fs/jbd2/transaction.c 2008-05-28 16:10:41.000000000 -0700
+++ linux-2.6.26-rc3/fs/jbd2/transaction.c 2008-05-28 16:13:16.000000000 -0700
@@ -1656,12 +1656,42 @@ out:
return;
}
+/*
+ * jbd2_journal_try_to_free_buffers() could race with jbd2_journal_commit_transaction()
+ * The later might still hold the reference count to the buffers when inspecting
+ * them on t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * try to free that buffer.
+ *
+ * Called with journal->j_state_lock hold.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+}
/**
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
*
*
* For all the buffers on this page,
@@ -1690,9 +1720,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1721,7 +1753,30 @@ int jbd2_journal_try_to_free_buffers(jou
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where jbd2_journal_try_to_free_buffers()
+ * could race with jbd2_journal_commit_transaction(), the later still
+ * holds the reference to the buffers to free while processing them.
+ * try_to_free_buffers() failed to free those buffers. Some of the
+ * caller of releasepage() request page buffers to be dropped, otherwise
+ * treat the fail-to-free as errors (such as generic_file_direct_IO())
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour(i.e make sure buffers are dropped upon return),
+ * let's wait for the current transaction to finish flush of
+ * dirty data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+ spin_lock(&journal->j_state_lock);
+ jbd2_journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ spin_unlock(&journal->j_state_lock);
+ }
+
busy:
return ret;
}
On Wed, May 28, 2008 at 05:18:19PM -0700, Mingming Cao wrote:
> Index: linux-2.6.26-rc3/fs/jbd2/transaction.c
> ===================================================================
> --- linux-2.6.26-rc3.orig/fs/jbd2/transaction.c 2008-05-28 16:10:41.000000000 -0700
> +++ linux-2.6.26-rc3/fs/jbd2/transaction.c 2008-05-28 16:13:16.000000000 -0700
> @@ -1656,12 +1656,42 @@ out:
> return;
> }
>
> +/*
> + * jbd2_journal_try_to_free_buffers() could race with jbd2_journal_commit_transaction()
> + * The later might still hold the reference count to the buffers when inspecting
> + * them on t_syncdata_list or t_locked_list.
> + *
> + * jbd2_journal_try_to_free_buffers() will call this function to
> + * wait for the current transaction to finish syncing data buffers, before
> + * try to free that buffer.
> + *
> + * Called with journal->j_state_lock hold.
> + */
We are taking the spin_lock again in the function ??
> +static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
> +{
> + transaction_t *transaction = NULL;
> + tid_t tid;
> +
> + spin_lock(&journal->j_state_lock);
> + transaction = journal->j_committing_transaction;
> +
> + if (!transaction) {
> + spin_unlock(&journal->j_state_lock);
> + return;
> + }
> +
> + tid = transaction->t_tid;
> + spin_unlock(&journal->j_state_lock);
> + jbd2_log_wait_commit(journal, tid);
> +}
[.... snip.... ]
> + if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
> + spin_lock(&journal->j_state_lock);
> + jbd2_journal_wait_for_transaction_sync_data(journal);
> + ret = try_to_free_buffers(page);
> + spin_unlock(&journal->j_state_lock);
> + }
> +
> busy:
> return ret;
> }
>
>
-aneesh
On Fri, 2008-05-30 at 11:54 +0530, Aneesh Kumar K.V wrote:
> On Wed, May 28, 2008 at 05:18:19PM -0700, Mingming Cao wrote:
> > Index: linux-2.6.26-rc3/fs/jbd2/transaction.c
> > ===================================================================
> > --- linux-2.6.26-rc3.orig/fs/jbd2/transaction.c 2008-05-28 16:10:41.000000000 -0700
> > +++ linux-2.6.26-rc3/fs/jbd2/transaction.c 2008-05-28 16:13:16.000000000 -0700
> > @@ -1656,12 +1656,42 @@ out:
> > return;
> > }
> >
> > +/*
> > + * jbd2_journal_try_to_free_buffers() could race with jbd2_journal_commit_transaction()
> > + * The later might still hold the reference count to the buffers when inspecting
> > + * them on t_syncdata_list or t_locked_list.
> > + *
> > + * jbd2_journal_try_to_free_buffers() will call this function to
> > + * wait for the current transaction to finish syncing data buffers, before
> > + * try to free that buffer.
> > + *
> > + * Called with journal->j_state_lock hold.
> > + */
>
> We are taking the spin_lock again in the function ??
>
Thanks. I noticed this yesterday and sent Andrew an updated patch to
replace the one he just added to the mm tree, but forgot to copy the list. Here is the updated patch:
Mingming
---
fs/jbd2/transaction.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 58 insertions(+), 3 deletions(-)
Index: linux-2.6.26-rc4/fs/jbd2/transaction.c
===================================================================
--- linux-2.6.26-rc4.orig/fs/jbd2/transaction.c 2008-05-29 12:21:40.000000000 -0700
+++ linux-2.6.26-rc4/fs/jbd2/transaction.c 2008-05-29 12:38:30.000000000 -0700
@@ -1656,12 +1656,43 @@ out:
return;
}
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The later might still hold the
+ * reference count to the buffers when inspecting them on
+ * t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * try to free that buffer.
+ *
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+}
/**
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
*
*
* For all the buffers on this page,
@@ -1690,9 +1721,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1708,7 +1741,8 @@ int jbd2_journal_try_to_free_buffers(jou
/*
* We take our own ref against the journal_head here to avoid
* having to add tons of locking around each instance of
- * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+ * jbd2_journal_remove_journal_head() and
+ * jbd2_journal_put_journal_head().
*/
jh = jbd2_journal_grab_journal_head(bh);
if (!jh)
@@ -1721,7 +1755,28 @@ int jbd2_journal_try_to_free_buffers(jou
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where jbd2_journal_try_to_free_buffers()
+ * could race with jbd2_journal_commit_transaction(), the later still
+ * holds the reference to the buffers to free while processing them.
+ * try_to_free_buffers() failed to free those buffers. Some of the
+ * caller of releasepage() request page buffers to be dropped, otherwise
+ * treat the fail-to-free as errors (such as generic_file_direct_IO())
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour(i.e make sure buffers are dropped upon return),
+ * let's wait for the current transaction to finish flush of
+ * dirty data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+ jbd2_journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ }
+
busy:
return ret;
}
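For context on where the gfp_mask driving the new retry logic comes
from: the VM hands it to the filesystem's releasepage hook, which
forwards it to JBD. Roughly, ext3's hook of this era looks like the
following (reconstructed from memory, so treat the details as
approximate):

static int ext3_releasepage(struct page *page, gfp_t wait)
{
	journal_t *journal = EXT3_JOURNAL(page->mapping->host);

	WARN_ON(PageChecked(page));
	if (!page_has_buffers(page))
		return 0;
	/* The gfp mask flows straight through to the JBD helper. */
	return journal_try_to_free_buffers(journal, page, wait);
}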