From: Bobi Jam Subject: [PATCH v2] ext4: expand commit callback Date: Wed, 09 Nov 2011 17:49:53 +0800 Message-ID: <4EBA4CC1.5020605@whamcloud.com> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit Cc: linux-ext4@vger.kernel.org To: Theodore Tso , Andreas Dilger Return-path: Received: from mail-yw0-f46.google.com ([209.85.213.46]:37805 "EHLO mail-yw0-f46.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752188Ab1KIJuI (ORCPT ); Wed, 9 Nov 2011 04:50:08 -0500 Received: by ywf7 with SMTP id 7so1532138ywf.19 for ; Wed, 09 Nov 2011 01:50:08 -0800 (PST) Sender: linux-ext4-owner@vger.kernel.org List-ID: Hi, This patch has been rebased on v3.2-rc1-91. The per-commit callback was used by mballoc code to manage free space bitmaps after deleted blocks have been released. This patch expand it to contain multiple different callbacks. * add a ext4_journal_cb_entry to store journal callback function. * add an ext4_journal_cb_entry member in ext4_free_data. * transaction->t_private_list is used to link all journal callback entries. * add ext4_journal_callback_add/del functions to add/deletel callback entry into/from jounal->j_private. Signed-off-by: Bobi Jam Signed-off-by: Andreas Dilger --- fs/ext4/ext4_jbd2.h | 72 ++++++++++++++++++++++++ fs/ext4/mballoc.c | 151 ++++++++++++++++++++++++--------------------------- fs/ext4/mballoc.h | 18 ++++--- fs/ext4/super.c | 18 ++++++ 4 files changed, 171 insertions(+), 88 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5802fa1..c65a774 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -104,6 +104,78 @@ #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) +/** + * struct ext4_journal_cb_entry - Base structure for callback information. + * + * This struct is a 'seed' structure for a using with your own callback + * structs. If you are using callbacks you must allocate one of these + * or another struct of your own definition which has this struct + * as it's first element and pass it to ext4_journal_callback_add(). + */ +struct ext4_journal_cb_entry { + /* list information for other callbacks attached to the same handle */ + struct list_head jce_list; + + /* Function to call with this callback structure */ + void (*jce_func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int error); + + /* user data goes here */ +}; + +/** + * ext4_journal_callback_add: add a function to call after transaction commit + * @handle: active journal transaction handle to register callback on + * @func: callback function to call after the transaction has committed: + * @sb: superblock of current filesystem for transaction + * @jce: returned journal callback data + * @rc: journal state at commit (0 = transaction committed properly) + * @jce: journal callback data (internal and function private data struct) + * + * The registered function will be called in the context of the journal thread + * after the transaction for which the handle was created has completed. + * + * No locks are held when the callback function is called, so it is safe to + * call blocking functions from within the callback, but the callback should + * not block or run for too long, or the filesystem will be blocked waiting for + * the next transaction to commit. No journaling functions can be used, or + * there is a risk of deadlock. + * + * There is no guaranteed calling order of multiple registered callbacks on + * the same transaction. + */ +static inline void ext4_journal_callback_add(handle_t *handle, + void (*func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc), + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + /* Add the jce to transaction's private list */ + jce->jce_func = func; + spin_lock(&sbi->s_md_lock); + list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); +} + +/** + * ext4_journal_callback_del: delete a registered callback + * @handle: active journal transaction handle on which callback was registered + * @jce: registered journal callback entry to unregister + */ +static inline void ext4_journal_callback_del(handle_t *handle, + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + spin_lock(&sbi->s_md_lock); + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); +} + int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e2d8be8..fdf3f30 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -21,6 +21,7 @@ * mballoc.c contains the multiblocks allocation routines */ +#include "ext4_jbd2.h" #include "mballoc.h" #include #include @@ -339,7 +340,7 @@ */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; -static struct kmem_cache *ext4_free_ext_cachep; +static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for @@ -357,7 +358,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -2522,9 +2524,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); - if (sbi->s_journal) - sbi->s_journal->j_commit_callback = release_blocks_on_commit; - return 0; out_free_locality_groups: @@ -2637,58 +2636,55 @@ static inline int ext4_issue_discard(struct super_block *sb, * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. */ -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc) { - struct super_block *sb = journal->j_private; + struct ext4_free_data *entry = (struct ext4_free_data *)jce; struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0, count2 = 0; - struct ext4_free_data *entry; - struct list_head *l, *ltmp; - list_for_each_safe(l, ltmp, &txn->t_private_list) { - entry = list_entry(l, struct ext4_free_data, list); + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->efd_count, entry->efd_group, entry); - mb_debug(1, "gonna free %u blocks in group %u (0x%p):", - entry->count, entry->group, entry); + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, entry->efd_count); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->group, - entry->start_cluster, entry->count); + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); - err = ext4_mb_load_buddy(sb, entry->group, &e4b); - /* we expect to find existing buddy because it's pinned */ - BUG_ON(err != 0); - db = e4b.bd_info; - /* there are blocks to put in buddy to make them really free */ - count += entry->count; - count2++; - ext4_lock_group(sb, entry->group); - /* Take it out of per group rb tree */ - rb_erase(&entry->node, &(db->bb_free_root)); - mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->efd_count; + count2++; + ext4_lock_group(sb, entry->efd_group); + /* Take it out of per group rb tree */ + rb_erase(&entry->efd_node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); - /* - * Clear the trimmed flag for the group so that the next - * ext4_trim_fs can trim it. - * If the volume is mounted with -o discard, online discard - * is supported and the free blocks will be trimmed online. - */ - if (!test_opt(sb, DISCARD)) - EXT4_MB_GRP_CLEAR_TRIMMED(db); + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); - if (!db->bb_free_root.rb_node) { - /* No more items in the per group rb tree - * balance refcounts from ext4_mb_free_metadata() - */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - } - ext4_unlock_group(sb, entry->group); - kmem_cache_free(ext4_free_ext_cachep, entry); - ext4_mb_unload_buddy(&e4b); + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); } + ext4_unlock_group(sb, entry->efd_group); + kmem_cache_free(ext4_free_data_cachep, entry); + ext4_mb_unload_buddy(&e4b); mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } @@ -2741,9 +2737,9 @@ int __init ext4_init_mballoc(void) return -ENOMEM; } - ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, - SLAB_RECLAIM_ACCOUNT); - if (ext4_free_ext_cachep == NULL) { + ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_data_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); return -ENOMEM; @@ -2761,7 +2757,7 @@ void ext4_exit_mballoc(void) rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); - kmem_cache_destroy(ext4_free_ext_cachep); + kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); ext4_remove_debugfs_entry(); } @@ -3321,8 +3317,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, n = rb_first(&(grp->bb_free_root)); while (n) { - entry = rb_entry(n, struct ext4_free_data, node); - ext4_set_bits(bitmap, entry->start_cluster, entry->count); + entry = rb_entry(n, struct ext4_free_data, efd_node); + ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); n = rb_next(n); } return; @@ -4428,9 +4424,9 @@ out: static int can_merge(struct ext4_free_data *entry1, struct ext4_free_data *entry2) { - if ((entry1->t_tid == entry2->t_tid) && - (entry1->group == entry2->group) && - ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) + if ((entry1->efd_tid == entry2->efd_tid) && + (entry1->efd_group == entry2->efd_group) && + ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) return 1; return 0; } @@ -4452,8 +4448,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); - new_node = &new_entry->node; - cluster = new_entry->start_cluster; + new_node = &new_entry->efd_node; + cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. We need to @@ -4466,10 +4462,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } while (*n) { parent = *n; - entry = rb_entry(parent, struct ext4_free_data, node); - if (cluster < entry->start_cluster) + entry = rb_entry(parent, struct ext4_free_data, efd_node); + if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; - else if (cluster >= (entry->start_cluster + entry->count)) + else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, @@ -4486,34 +4482,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, /* Now try to see the extent can be merged to left and right */ node = rb_prev(new_node); if (node) { - entry = rb_entry(node, struct ext4_free_data, node); + entry = rb_entry(node, struct ext4_free_data, efd_node); if (can_merge(entry, new_entry)) { - new_entry->start_cluster = entry->start_cluster; - new_entry->count += entry->count; + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - spin_lock(&sbi->s_md_lock); - list_del(&entry->list); - spin_unlock(&sbi->s_md_lock); - kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_journal_callback_del(handle, &entry->efd_jce); + kmem_cache_free(ext4_free_data_cachep, entry); } } node = rb_next(new_node); if (node) { - entry = rb_entry(node, struct ext4_free_data, node); + entry = rb_entry(node, struct ext4_free_data, efd_node); if (can_merge(new_entry, entry)) { - new_entry->count += entry->count; + new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - spin_lock(&sbi->s_md_lock); - list_del(&entry->list); - spin_unlock(&sbi->s_md_lock); - kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_journal_callback_del(handle, &entry->efd_jce); + kmem_cache_free(ext4_free_data_cachep, entry); } } /* Add the extent to transaction's private list */ - spin_lock(&sbi->s_md_lock); - list_add(&new_entry->list, &handle->h_transaction->t_private_list); - spin_unlock(&sbi->s_md_lock); + ext4_journal_callback_add(handle, ext4_free_data_callback, + &new_entry->efd_jce); return 0; } @@ -4691,15 +4682,15 @@ do_more: * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed */ - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); if (!new_entry) { err = -ENOMEM; goto error_return; } - new_entry->start_cluster = bit; - new_entry->group = block_group; - new_entry->count = count_clusters; - new_entry->t_tid = handle->h_transaction->t_tid; + new_entry->efd_start_cluster = bit; + new_entry->efd_group = block_group; + new_entry->efd_count = count_clusters; + new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 47705f3..7128f46 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -96,21 +96,23 @@ extern u8 mb_enable_debug; struct ext4_free_data { - /* this links the free block information from group_info */ - struct rb_node node; + /* MUST be the first member */ + struct ext4_journal_cb_entry efd_jce; + + /* ext4_free_data private data starts from here */ - /* this links the free block information from ext4_sb_info */ - struct list_head list; + /* this links the free block information from group_info */ + struct rb_node efd_node; /* group which free block extent belongs */ - ext4_group_t group; + ext4_group_t efd_group; /* free block extent */ - ext4_grpblk_t start_cluster; - ext4_grpblk_t count; + ext4_grpblk_t efd_start_cluster; + ext4_grpblk_t efd_count; /* transaction which freed this extent */ - tid_t t_tid; + tid_t efd_tid; }; struct ext4_prealloc_space { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9953d80..f5da954 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -431,6 +431,22 @@ static int block_device_ejected(struct super_block *sb) return bdi->dev == NULL; } +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int error = is_journal_aborted(journal); + struct ext4_journal_cb_entry *jce, *tmp; + + spin_lock(&sbi->s_md_lock); + list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); + jce->jce_func(sb, jce, error); + spin_lock(&sbi->s_md_lock); + } + spin_unlock(&sbi->s_md_lock); +} /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. @@ -3698,6 +3714,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; + /* * The journal may have updated the bg summary counts, so we * need to update the global counters. -- 1.7.7