2011-11-09 11:35:41

by Bobi Jam

[permalink] [raw]
Subject: [PATCH v2] ext4: expand commit callback

The per-commit callback was used by mballoc code to manage free
space bitmaps after deleted blocks have been released. This patch
expand it to contain multiple different callbacks.

* add a ext4_journal_cb_entry to store journal callback function.
* add an ext4_journal_cb_entry member in ext4_free_data.
* transaction->t_private_list is used to link all journal callback
entries.
* add ext4_journal_callback_add/del functions to add/deletel
callback entry into/from jounal->j_private.

Signed-off-by: Bobi Jam <[email protected]>
Signed-off-by: Andreas Dilger <[email protected]>
---
fs/ext4/ext4_jbd2.h | 72 ++++++++++++++++++++++++
fs/ext4/mballoc.c | 151 ++++++++++++++++++++++++---------------------------
fs/ext4/mballoc.h | 18 ++++---
fs/ext4/super.c | 18 ++++++
4 files changed, 171 insertions(+), 88 deletions(-)

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 5802fa1..c65a774 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -104,6 +104,78 @@
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))

+/**
+ * struct ext4_journal_cb_entry - Base structure for callback information.
+ *
+ * This struct is a 'seed' structure for a using with your own callback
+ * structs. If you are using callbacks you must allocate one of these
+ * or another struct of your own definition which has this struct
+ * as it's first element and pass it to ext4_journal_callback_add().
+ */
+struct ext4_journal_cb_entry {
+ /* list information for other callbacks attached to the same handle */
+ struct list_head jce_list;
+
+ /* Function to call with this callback structure */
+ void (*jce_func)(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce, int error);
+
+ /* user data goes here */
+};
+
+/**
+ * ext4_journal_callback_add: add a function to call after transaction commit
+ * @handle: active journal transaction handle to register callback on
+ * @func: callback function to call after the transaction has committed:
+ * @sb: superblock of current filesystem for transaction
+ * @jce: returned journal callback data
+ * @rc: journal state at commit (0 = transaction committed properly)
+ * @jce: journal callback data (internal and function private data struct)
+ *
+ * The registered function will be called in the context of the journal thread
+ * after the transaction for which the handle was created has completed.
+ *
+ * No locks are held when the callback function is called, so it is safe to
+ * call blocking functions from within the callback, but the callback should
+ * not block or run for too long, or the filesystem will be blocked waiting for
+ * the next transaction to commit. No journaling functions can be used, or
+ * there is a risk of deadlock.
+ *
+ * There is no guaranteed calling order of multiple registered callbacks on
+ * the same transaction.
+ */
+static inline void ext4_journal_callback_add(handle_t *handle,
+ void (*func)(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce,
+ int rc),
+ struct ext4_journal_cb_entry *jce)
+{
+ struct ext4_sb_info *sbi =
+ EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+ /* Add the jce to transaction's private list */
+ jce->jce_func = func;
+ spin_lock(&sbi->s_md_lock);
+ list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+ spin_unlock(&sbi->s_md_lock);
+}
+
+/**
+ * ext4_journal_callback_del: delete a registered callback
+ * @handle: active journal transaction handle on which callback was registered
+ * @jce: registered journal callback entry to unregister
+ */
+static inline void ext4_journal_callback_del(handle_t *handle,
+ struct ext4_journal_cb_entry *jce)
+{
+ struct ext4_sb_info *sbi =
+ EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+ spin_lock(&sbi->s_md_lock);
+ list_del_init(&jce->jce_list);
+ spin_unlock(&sbi->s_md_lock);
+}
+
int
ext4_mark_iloc_dirty(handle_t *handle,
struct inode *inode,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e2d8be8..fdf3f30 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -21,6 +21,7 @@
* mballoc.c contains the multiblocks allocation routines
*/

+#include "ext4_jbd2.h"
#include "mballoc.h"
#include <linux/debugfs.h>
#include <linux/slab.h>
@@ -339,7 +340,7 @@
*/
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
+static struct kmem_cache *ext4_free_data_cachep;

/* We create slab caches for groupinfo data structures based on the
* superblock block size. There will be one per mounted filesystem for
@@ -357,7 +358,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce, int rc);

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -2522,9 +2524,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);

- if (sbi->s_journal)
- sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-
return 0;

out_free_locality_groups:
@@ -2637,58 +2636,55 @@ static inline int ext4_issue_discard(struct super_block *sb,
* This function is called by the jbd2 layer once the commit has finished,
* so we know we can free the blocks that were released with that commit.
*/
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce,
+ int rc)
{
- struct super_block *sb = journal->j_private;
+ struct ext4_free_data *entry = (struct ext4_free_data *)jce;
struct ext4_buddy e4b;
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
- struct ext4_free_data *entry;
- struct list_head *l, *ltmp;

- list_for_each_safe(l, ltmp, &txn->t_private_list) {
- entry = list_entry(l, struct ext4_free_data, list);
+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+ entry->efd_count, entry->efd_group, entry);

- mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
- entry->count, entry->group, entry);
+ if (test_opt(sb, DISCARD))
+ ext4_issue_discard(sb, entry->efd_group,
+ entry->efd_start_cluster, entry->efd_count);

- if (test_opt(sb, DISCARD))
- ext4_issue_discard(sb, entry->group,
- entry->start_cluster, entry->count);
+ err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
+ /* we expect to find existing buddy because it's pinned */
+ BUG_ON(err != 0);

- err = ext4_mb_load_buddy(sb, entry->group, &e4b);
- /* we expect to find existing buddy because it's pinned */
- BUG_ON(err != 0);

- db = e4b.bd_info;
- /* there are blocks to put in buddy to make them really free */
- count += entry->count;
- count2++;
- ext4_lock_group(sb, entry->group);
- /* Take it out of per group rb tree */
- rb_erase(&entry->node, &(db->bb_free_root));
- mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count);
+ db = e4b.bd_info;
+ /* there are blocks to put in buddy to make them really free */
+ count += entry->efd_count;
+ count2++;
+ ext4_lock_group(sb, entry->efd_group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->efd_node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);

- /*
- * Clear the trimmed flag for the group so that the next
- * ext4_trim_fs can trim it.
- * If the volume is mounted with -o discard, online discard
- * is supported and the free blocks will be trimmed online.
- */
- if (!test_opt(sb, DISCARD))
- EXT4_MB_GRP_CLEAR_TRIMMED(db);
+ /*
+ * Clear the trimmed flag for the group so that the next
+ * ext4_trim_fs can trim it.
+ * If the volume is mounted with -o discard, online discard
+ * is supported and the free blocks will be trimmed online.
+ */
+ if (!test_opt(sb, DISCARD))
+ EXT4_MB_GRP_CLEAR_TRIMMED(db);

- if (!db->bb_free_root.rb_node) {
- /* No more items in the per group rb tree
- * balance refcounts from ext4_mb_free_metadata()
- */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
- }
- ext4_unlock_group(sb, entry->group);
- kmem_cache_free(ext4_free_ext_cachep, entry);
- ext4_mb_unload_buddy(&e4b);
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
}
+ ext4_unlock_group(sb, entry->efd_group);
+ kmem_cache_free(ext4_free_data_cachep, entry);
+ ext4_mb_unload_buddy(&e4b);

mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
}
@@ -2741,9 +2737,9 @@ int __init ext4_init_mballoc(void)
return -ENOMEM;
}

- ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
- SLAB_RECLAIM_ACCOUNT);
- if (ext4_free_ext_cachep == NULL) {
+ ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
+ SLAB_RECLAIM_ACCOUNT);
+ if (ext4_free_data_cachep == NULL) {
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
return -ENOMEM;
@@ -2761,7 +2757,7 @@ void ext4_exit_mballoc(void)
rcu_barrier();
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
- kmem_cache_destroy(ext4_free_ext_cachep);
+ kmem_cache_destroy(ext4_free_data_cachep);
ext4_groupinfo_destroy_slabs();
ext4_remove_debugfs_entry();
}
@@ -3321,8 +3317,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
n = rb_first(&(grp->bb_free_root));

while (n) {
- entry = rb_entry(n, struct ext4_free_data, node);
- ext4_set_bits(bitmap, entry->start_cluster, entry->count);
+ entry = rb_entry(n, struct ext4_free_data, efd_node);
+ ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
n = rb_next(n);
}
return;
@@ -4428,9 +4424,9 @@ out:
static int can_merge(struct ext4_free_data *entry1,
struct ext4_free_data *entry2)
{
- if ((entry1->t_tid == entry2->t_tid) &&
- (entry1->group == entry2->group) &&
- ((entry1->start_cluster + entry1->count) == entry2->start_cluster))
+ if ((entry1->efd_tid == entry2->efd_tid) &&
+ (entry1->efd_group == entry2->efd_group) &&
+ ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
return 1;
return 0;
}
@@ -4452,8 +4448,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);

- new_node = &new_entry->node;
- cluster = new_entry->start_cluster;
+ new_node = &new_entry->efd_node;
+ cluster = new_entry->efd_start_cluster;

if (!*n) {
/* first free block exent. We need to
@@ -4466,10 +4462,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
}
while (*n) {
parent = *n;
- entry = rb_entry(parent, struct ext4_free_data, node);
- if (cluster < entry->start_cluster)
+ entry = rb_entry(parent, struct ext4_free_data, efd_node);
+ if (cluster < entry->efd_start_cluster)
n = &(*n)->rb_left;
- else if (cluster >= (entry->start_cluster + entry->count))
+ else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
n = &(*n)->rb_right;
else {
ext4_grp_locked_error(sb, group, 0,
@@ -4486,34 +4482,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
/* Now try to see the extent can be merged to left and right */
node = rb_prev(new_node);
if (node) {
- entry = rb_entry(node, struct ext4_free_data, node);
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
if (can_merge(entry, new_entry)) {
- new_entry->start_cluster = entry->start_cluster;
- new_entry->count += entry->count;
+ new_entry->efd_start_cluster = entry->efd_start_cluster;
+ new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- spin_lock(&sbi->s_md_lock);
- list_del(&entry->list);
- spin_unlock(&sbi->s_md_lock);
- kmem_cache_free(ext4_free_ext_cachep, entry);
+ ext4_journal_callback_del(handle, &entry->efd_jce);
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
}

node = rb_next(new_node);
if (node) {
- entry = rb_entry(node, struct ext4_free_data, node);
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
if (can_merge(new_entry, entry)) {
- new_entry->count += entry->count;
+ new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- spin_lock(&sbi->s_md_lock);
- list_del(&entry->list);
- spin_unlock(&sbi->s_md_lock);
- kmem_cache_free(ext4_free_ext_cachep, entry);
+ ext4_journal_callback_del(handle, &entry->efd_jce);
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
}
/* Add the extent to transaction's private list */
- spin_lock(&sbi->s_md_lock);
- list_add(&new_entry->list, &handle->h_transaction->t_private_list);
- spin_unlock(&sbi->s_md_lock);
+ ext4_journal_callback_add(handle, ext4_free_data_callback,
+ &new_entry->efd_jce);
return 0;
}

@@ -4691,15 +4682,15 @@ do_more:
* blocks being freed are metadata. these blocks shouldn't
* be used until this transaction is committed
*/
- new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
if (!new_entry) {
err = -ENOMEM;
goto error_return;
}
- new_entry->start_cluster = bit;
- new_entry->group = block_group;
- new_entry->count = count_clusters;
- new_entry->t_tid = handle->h_transaction->t_tid;
+ new_entry->efd_start_cluster = bit;
+ new_entry->efd_group = block_group;
+ new_entry->efd_count = count_clusters;
+ new_entry->efd_tid = handle->h_transaction->t_tid;

ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 47705f3..7128f46 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -96,21 +96,23 @@ extern u8 mb_enable_debug;


struct ext4_free_data {
- /* this links the free block information from group_info */
- struct rb_node node;
+ /* MUST be the first member */
+ struct ext4_journal_cb_entry efd_jce;
+
+ /* ext4_free_data private data starts from here */

- /* this links the free block information from ext4_sb_info */
- struct list_head list;
+ /* this links the free block information from group_info */
+ struct rb_node efd_node;

/* group which free block extent belongs */
- ext4_group_t group;
+ ext4_group_t efd_group;

/* free block extent */
- ext4_grpblk_t start_cluster;
- ext4_grpblk_t count;
+ ext4_grpblk_t efd_start_cluster;
+ ext4_grpblk_t efd_count;

/* transaction which freed this extent */
- tid_t t_tid;
+ tid_t efd_tid;
};

struct ext4_prealloc_space {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9953d80..f5da954 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -431,6 +431,22 @@ static int block_device_ejected(struct super_block *sb)
return bdi->dev == NULL;
}

+static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int error = is_journal_aborted(journal);
+ struct ext4_journal_cb_entry *jce, *tmp;
+
+ spin_lock(&sbi->s_md_lock);
+ list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
+ list_del_init(&jce->jce_list);
+ spin_unlock(&sbi->s_md_lock);
+ jce->jce_func(sb, jce, error);
+ spin_lock(&sbi->s_md_lock);
+ }
+ spin_unlock(&sbi->s_md_lock);
+}

/* Deal with the reporting of failure conditions on a filesystem such as
* inconsistencies detected or read IO failures.
@@ -3698,6 +3714,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);

+ sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
+
/*
* The journal may have updated the bg summary counts, so we
* need to update the global counters.
--
1.7.7