2008-10-17 01:42:46

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH, RFC] ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback

The multiblock allocator needs to be able to release blocks (and issue
a blkdev discard request) when the transaction which freed those
blocks is committed. Previously this was done via a polling mechanism
when blocks are allocated or freed. A much better way of doing things
is to create a jbd2 callback function and attaching the list of blocks
to be freed directly to the transaction structure.

Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4_sb.h | 3 --
fs/ext4/mballoc.c | 85 +++++++++----------------------------------------
fs/ext4/mballoc.h | 3 +-
fs/jbd2/commit.c | 3 ++
fs/jbd2/transaction.c | 1 +
include/linux/jbd2.h | 9 +++++
6 files changed, 29 insertions(+), 75 deletions(-)

diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6a0b40d..445fde6 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -99,9 +99,6 @@ struct ext4_sb_info {
struct inode *s_buddy_cache;
long s_blocks_reserved;
spinlock_t s_reserve_lock;
- struct list_head s_active_transaction;
- struct list_head s_closed_transaction;
- struct list_head s_committed_transaction;
spinlock_t s_md_lock;
tid_t s_last_transaction;
unsigned short *s_mb_offsets, *s_mb_maxs;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index da1da1f..dfe17a1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2523,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
}

spin_lock_init(&sbi->s_md_lock);
- INIT_LIST_HEAD(&sbi->s_active_transaction);
- INIT_LIST_HEAD(&sbi->s_closed_transaction);
- INIT_LIST_HEAD(&sbi->s_committed_transaction);
spin_lock_init(&sbi->s_bal_lock);

sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
@@ -2554,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
ext4_mb_init_per_dev_proc(sb);
ext4_mb_history_init(sb);

+ sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+
printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
return 0;
}
@@ -2583,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_group_info *grinfo;
struct ext4_sb_info *sbi = EXT4_SB(sb);

- /* release freed, non-committed blocks */
- spin_lock(&sbi->s_md_lock);
- list_splice_init(&sbi->s_closed_transaction,
- &sbi->s_committed_transaction);
- list_splice_init(&sbi->s_active_transaction,
- &sbi->s_committed_transaction);
- spin_unlock(&sbi->s_md_lock);
- ext4_mb_free_committed_blocks(sb);
-
if (sbi->s_group_info) {
for (i = 0; i < sbi->s_groups_count; i++) {
grinfo = ext4_get_group_info(sb, i);
@@ -2645,36 +2635,25 @@ int ext4_mb_release(struct super_block *sb)
return 0;
}

-static noinline_for_stack void
-ext4_mb_free_committed_blocks(struct super_block *sb)
+/*
+ * This function is called by the jbd2 layer once the commit has finished,
+ * so we know we can free the blocks that were released with that commit.
+ */
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
{
+ struct super_block *sb = journal->j_private;
struct ext4_buddy e4b;
struct ext4_group_info *db;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
int err, count = 0, count2 = 0;
struct ext4_free_data *entry;
ext4_fsblk_t discard_block;
+ struct list_head *l, *ltmp;

- if (list_empty(&sbi->s_committed_transaction))
- return;
-
- /* there is committed blocks to be freed yet */
- do {
- /* get next array of blocks */
- entry = NULL;
- spin_lock(&sbi->s_md_lock);
- if (!list_empty(&sbi->s_committed_transaction)) {
- entry = list_entry(sbi->s_committed_transaction.next,
- struct ext4_free_data, list);
- list_del(&entry->list);
- }
- spin_unlock(&sbi->s_md_lock);
-
- if (entry == NULL)
- break;
+ list_for_each_safe(l, ltmp, &txn->t_private_list) {
+ entry = list_entry(l, struct ext4_free_data, list);

mb_debug("gonna free %u blocks in group %lu (0x%p):",
- entry->count, entry->group, entry);
+ entry->count, entry->group, entry);

err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
@@ -2706,7 +2685,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)

kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
- } while (1);
+ }

mb_debug("freed %u blocks in %u structures\n", count, count2);
}
@@ -4348,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
goto out1;
}

- ext4_mb_poll_new_transaction(sb, handle);
-
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
@@ -4408,36 +4385,6 @@ out1:

return block;
}
-static void ext4_mb_poll_new_transaction(struct super_block *sb,
- handle_t *handle)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_last_transaction == handle->h_transaction->t_tid)
- return;
-
- /* new transaction! time to close last one and free blocks for
- * committed transaction. we know that only transaction can be
- * active, so previos transaction can be being logged and we
- * know that transaction before previous is known to be already
- * logged. this means that now we may free blocks freed in all
- * transactions before previous one. hope I'm clear enough ... */
-
- spin_lock(&sbi->s_md_lock);
- if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
- mb_debug("new transaction %lu, old %lu\n",
- (unsigned long) handle->h_transaction->t_tid,
- (unsigned long) sbi->s_last_transaction);
- list_splice_init(&sbi->s_closed_transaction,
- &sbi->s_committed_transaction);
- list_splice_init(&sbi->s_active_transaction,
- &sbi->s_closed_transaction);
- sbi->s_last_transaction = handle->h_transaction->t_tid;
- }
- spin_unlock(&sbi->s_md_lock);
-
- ext4_mb_free_committed_blocks(sb);
-}

/*
* We can merge two free data extents only if the physical blocks
@@ -4531,9 +4478,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
kmem_cache_free(ext4_free_ext_cachep, entry);
}
}
- /* Add the extent to active_transaction list */
+ /* Add the extent to transaction's private list */
spin_lock(&sbi->s_md_lock);
- list_add(&new_entry->list, &sbi->s_active_transaction);
+ list_add(&new_entry->list, &handle->h_transaction->t_private_list);
spin_unlock(&sbi->s_md_lock);
ext4_unlock_group(sb, group);
return 0;
@@ -4562,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,

*freed = 0;

- ext4_mb_poll_new_transaction(sb, handle);
-
sbi = EXT4_SB(sb);
es = EXT4_SB(sb)->s_es;
if (block < le32_to_cpu(es->s_first_data_block) ||
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 94cb7b9..b5dff1f 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -269,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);

static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
-static void ext4_mb_free_committed_blocks(struct super_block *);
static void ext4_mb_return_to_preallocation(struct inode *inode,
struct ext4_buddy *e4b, sector_t block,
int count);
@@ -278,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *,
struct super_block *, struct ext4_prealloc_space *pa);
static int ext4_mb_init_per_dev_proc(struct super_block *sb);
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);


static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0abe02c..8b119e1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -995,6 +995,9 @@ restart_loop:
}
spin_unlock(&journal->j_list_lock);

+ if (journal->j_commit_callback)
+ journal->j_commit_callback(journal, commit_transaction);
+
trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
journal->j_devname, commit_transaction->t_tid,
journal->j_tail_sequence);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e5d5405..39b7805 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
INIT_LIST_HEAD(&transaction->t_inode_list);
+ INIT_LIST_HEAD(&transaction->t_private_list);

/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index d2e91ea..3018583 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -641,6 +641,11 @@ struct transaction_s
*/
int t_handle_count;

+ /*
+ * For use by the filesystem to store fs-specific data
+ * structures associated with the transaction
+ */
+ struct list_head t_private_list;
};

struct transaction_run_stats_s {
@@ -935,6 +940,10 @@ struct journal_s

pid_t j_last_sync_writer;

+ /* This function is called when a transaction is closed */
+ void (*j_commit_callback)(journal_t *,
+ transaction_t *);
+
/*
* Journal statistics
*/
--
1.5.6.1.205.ge2c7.dirty


2008-10-17 06:04:41

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH, RFC] ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback

On Thu, Oct 16, 2008 at 08:02:43PM -0400, Theodore Ts'o wrote:
> The multiblock allocator needs to be able to release blocks (and issue
> a blkdev discard request) when the transaction which freed those
> blocks is committed. Previously this was done via a polling mechanism
> when blocks are allocated or freed. A much better way of doing things
> is to create a jbd2 callback function and attaching the list of blocks
> to be freed directly to the transaction structure.


Why not use journal commit callback patch from andreas
(MID:[email protected]
http://article.gmane.org/gmane.comp.file-systems.ext4/9148)

The patch you sent will allows only one call back to be registered.

-aneesh

2008-10-17 10:02:29

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH, RFC] ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback

On Fri, Oct 17, 2008 at 11:34:25AM +0530, Aneesh Kumar K.V wrote:
>
> Why not use journal commit callback patch from andreas
> (MID:[email protected]
> http://article.gmane.org/gmane.comp.file-systems.ext4/9148)
>
> The patch you sent will allows only one call back to be registered.
>

Thanks for reminding me about that; I had completely forgotten about
Andreas' patch. Sure, I'll respin the patch to use his extension.

- Ted

2008-10-17 12:26:10

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH, RFC] ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback

On Fri, Oct 17, 2008 at 06:02:14AM -0400, Theodore Tso wrote:
> >
> > Why not use journal commit callback patch from andreas
> > (MID:[email protected]
> > http://article.gmane.org/gmane.comp.file-systems.ext4/9148)
> >
> > The patch you sent will allows only one call back to be registered.
>
> Thanks for reminding me about that; I had completely forgotten about
> Andreas' patch. Sure, I'll respin the patch to use his extension.

I looked more closely at Andreas' patch, and it's really not a good
fit for what we want to do. The problem is that it is designed to
attach arbitrary callbacks on a per transaction basis. Each time you
add a callback you need to allocate a stucture, and then it gets
chained onto a inked list.

For what we need for mballoc, not only is it massive overkill, but
every single time we try to free blocks, we would have to and then
search the list to see the remove_committed_blocks() callback was or
wasn't on the list yet, and if it wasn't then allocate a chunk of
memory to hold the struct journal_callback data structure and call
journal_callback_set() function.

Furthermore, to get the locking right and to avoid race conditions,
we'd have to add a _journal_callback_set() function which doesn't take
the spinlock, and then take the spinlock ourselves, search the linked
list, allocate the memory, call _journal_callback_set(), and then
release the spinlock.

It was at about this point that I decided it Just Wasn't Worth It.

What I added was a dead-simple per-journal commit callback, with no
additional memory allocations (and requirement to do error handling if
the memory allocation fails), no need to take a spinlock before
manually adding the call back to each transaction handle, no need to
search the linked list to see if we have an entry on the linked list
already, etc.

If in the future we need a true per-transaction handle commit
callback, we can add this; but I think it still makes more sense to
keep the per-journal commit callback. After all, as it stands the
current patch results in a net reduction of 46 lines of code. Adding
all of this machinery would erase take far more than the savings by
removing ext4_mb_poll_new_transaction().

- Ted

2008-10-17 20:29:50

by Joel Becker

[permalink] [raw]
Subject: Re: [PATCH, RFC] ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback

On Fri, Oct 17, 2008 at 08:25:52AM -0400, Theodore Tso wrote:
> On Fri, Oct 17, 2008 at 06:02:14AM -0400, Theodore Tso wrote:
> > Thanks for reminding me about that; I had completely forgotten about
> > Andreas' patch. Sure, I'll respin the patch to use his extension.
>
> I looked more closely at Andreas' patch, and it's really not a good
> fit for what we want to do. The problem is that it is designed to
> attach arbitrary callbacks on a per transaction basis. Each time you
> add a callback you need to allocate a stucture, and then it gets
> chained onto a inked list.

Yeah, I looked at a similar scheme for the buffer commit
callbacks, and I realized that it would be a total pain. So I went with
a static callback.

> What I added was a dead-simple per-journal commit callback, with no
> additional memory allocations (and requirement to do error handling if
> the memory allocation fails), no need to take a spinlock before
> manually adding the call back to each transaction handle, no need to
> search the linked list to see if we have an entry on the linked list
> already, etc.

And this simple callback can used by a jbd2 client to build
the machinery for per-transaction callbacks if they want.

Joel

--

"We will have to repent in this generation not merely for the
vitriolic words and actions of the bad people, but for the
appalling silence of the good people."
- Rev. Dr. Martin Luther King, Jr.

Joel Becker
Principal Software Developer
Oracle
E-mail: [email protected]
Phone: (650) 506-8127

2008-10-19 22:49:52

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH, RFC] ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback

On Oct 17, 2008 08:25 -0400, Theodore Ts'o wrote:
> What I added was a dead-simple per-journal commit callback, with no
> additional memory allocations (and requirement to do error handling if
> the memory allocation fails), no need to take a spinlock before
> manually adding the call back to each transaction handle, no need to
> search the linked list to see if we have an entry on the linked list
> already, etc.
>
> If in the future we need a true per-transaction handle commit
> callback, we can add this; but I think it still makes more sense to
> keep the per-journal commit callback. After all, as it stands the
> current patch results in a net reduction of 46 lines of code. Adding
> all of this machinery would erase take far more than the savings by
> removing ext4_mb_poll_new_transaction().

The problem with the mechanism you've implemented is that it isn't
possible to add any other kind of callback to the journal. There
is only a single callback function, and the entries in the "t_private_list"
are all assumed to be "ext4_free_data" structures so even if other
users wanted to add callbacks they would only be handled by the
release_blocks_on_commit() function.

Is there any reason not to make this more generic and have the callback
function pointer embedded in the "ext4_free_data" struct in some way
that other callbacks could be registered? This would still avoid the
need to allocate for each of these operations, but would make the
callback mechanism more generic and useful.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2008-10-20 01:13:42

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH, RFC] ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback

On Sun, Oct 19, 2008 at 04:49:28PM -0600, Andreas Dilger wrote:
>
> The problem with the mechanism you've implemented is that it isn't
> possible to add any other kind of callback to the journal. There
> is only a single callback function, and the entries in the "t_private_list"
> are all assumed to be "ext4_free_data" structures so even if other
> users wanted to add callbacks they would only be handled by the
> release_blocks_on_commit() function.
>
> Is there any reason not to make this more generic and have the callback
> function pointer embedded in the "ext4_free_data" struct in some way
> that other callbacks could be registered? This would still avoid the
> need to allocate for each of these operations, but would make the
> callback mechanism more generic and useful.

Another possibility would be to simply re-point
sbi->s_journal->j_commit_callback() to a function like this:

static void new_commit_callback_func(journal_t *journal, transaction_t *txn)
{
do_new_callback_stuff(journal, txn)
release_blocks_on_commit(journal, txn)
}

Or Luster could initialize ext4, and then intercept the callback,
stash it away in a pointer, and then do this:

static void new_commit_callback_func(journal_t *journal, transaction_t *txn)
{
do_new_callback_stuff(journal, txn)
(*orig_value_of_j_commit_callback)(journal, txn)
}


If you want to be even *more* general, we could hang a struct
list_head off of struct ext4_sb_info which contains the linked list of
functions to be called, one of which would be release_blocks_on_commit(),
and then change sbi->s_journal->j_commit_callback to point to a function
which iterates over the functions in the list_head
sbi->s_commit_callback_funcs.

Do you have specific use in mind for Lustre? Could one of the above
schemes work for you?

- Ted