LinuxLists.cc - [RFC PATCH 1/4] ext4: DIO get

2009-12-16 01:37:58

Subject: [RFC PATCH 1/4] ext4: DIO get_block code cleanup

ext4: dio get_block code cleanup in prepare for it to be used by buffer write

Renaming the dio block allocation flags, variables and functions
introduced in Mingming's "Direct IO for holes and fallocate"
patches so that they can be used by ext4 buffer write as well.

Signed-off-by: Jiaying Zhang <[email protected]>
---
fs/ext4/ext4.h | 18 ++++----
fs/ext4/extents.c | 24 +++++------
fs/ext4/fsync.c | 2
fs/ext4/inode.c | 112 ++++++++++++++++++++++++++----------------------------
fs/ext4/super.c | 2
5 files changed, 78 insertions(+), 80 deletions(-)

Index: git-ext4/fs/ext4/extents.c
===================================================================
--- git-ext4.orig/fs/ext4/extents.c 2009-12-15 15:14:34.000000000 -0800
+++ git-ext4/fs/ext4/extents.c 2009-12-15 16:03:05.000000000 -0800
@@ -1603,7 +1603,7 @@ int ext4_ext_insert_extent(handle_t *han
BUG_ON(path[depth].p_hdr == NULL);

/* try to insert block into found extent and return */
- if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
&& ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
ext4_ext_is_uninitialized(newext),
@@ -1724,7 +1724,7 @@ has_space:

merge:
/* try to merge extents to the right */
- if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
ext4_ext_try_to_merge(inode, path, nearex);

/* try to merge extents to the left */
@@ -2966,7 +2966,7 @@ fix_extent_len:
ext4_ext_dirty(handle, inode, path + depth);
return err;
}
-static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+static int ext4_convert_unwritten_extents_endio(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path)
{
@@ -3038,8 +3038,8 @@ ext4_ext_handle_uninitialized_extents(ha
flags, allocated);
ext4_ext_show_leaf(inode, path);

- /* DIO get_block() before submit the IO, split the extent */
- if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ /* get_block() before submit the IO, split the extent */
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
ret = ext4_split_unwritten_extents(handle,
inode, path, iblock,
max_blocks, flags);
@@ -3049,14 +3049,14 @@ ext4_ext_handle_uninitialized_extents(ha
* completed
*/
if (io)
- io->flag = DIO_AIO_UNWRITTEN;
+ io->flag = EXT4_IO_UNWRITTEN;
else
EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
goto out;
}
- /* async DIO end_io complete, convert the filled extent to written */
- if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
- ret = ext4_convert_unwritten_extents_dio(handle, inode,
+ /* IO end_io complete, convert the filled extent to written */
+ if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
+ ret = ext4_convert_unwritten_extents_endio(handle, inode,
path);
goto out2;
}
@@ -3299,9 +3299,9 @@ int ext4_ext_get_blocks(handle_t *handle
* For non asycn direct IO case, flag the inode state
* that we need to perform convertion when IO is done.
*/
- if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
if (io)
- io->flag = DIO_AIO_UNWRITTEN;
+ io->flag = EXT4_IO_UNWRITTEN;
else
EXT4_I(inode)->i_state |=
EXT4_STATE_DIO_UNWRITTEN;;
@@ -3561,7 +3561,7 @@ int ext4_convert_unwritten_extents(struc
map_bh.b_state = 0;
ret = ext4_get_blocks(handle, inode, block,
max_blocks, &map_bh,
- EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+ EXT4_GET_BLOCKS_IO_CONVERT_EXT);
if (ret <= 0) {
WARN_ON(ret <= 0);
printk(KERN_ERR "%s: ext4_ext_get_blocks "
Index: git-ext4/fs/ext4/ext4.h
===================================================================
--- git-ext4.orig/fs/ext4/ext4.h 2009-12-15 15:14:34.000000000 -0800
+++ git-ext4/fs/ext4/ext4.h 2009-12-15 16:03:05.000000000 -0800
@@ -133,7 +133,7 @@ struct mpage_da_data {
int pages_written;
int retval;
};
-#define DIO_AIO_UNWRITTEN 0x1
+#define EXT4_IO_UNWRITTEN 0x1
typedef struct ext4_io_end {
struct list_head list; /* per-file finished AIO list */
struct inode *inode; /* file being written to */
@@ -367,13 +367,13 @@ struct ext4_new_group_data {
/* caller is from the direct IO path, request to creation of an
unitialized extents if not allocated, split the uninitialized
extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_DIO 0x0010
+#define EXT4_GET_BLOCKS_PRE_IO 0x0010
#define EXT4_GET_BLOCKS_CONVERT 0x0020
-#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
+ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ /* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
- /* Convert extent to initialized after direct IO complete */
-#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT
(EXT4_GET_BLOCKS_CONVERT|\
- EXT4_GET_BLOCKS_DIO_CREATE_EXT)

/*
* Flags used by ext4_free_blocks
@@ -707,8 +707,8 @@ struct ext4_inode_info {

spinlock_t i_block_reservation_lock;

- /* completed async DIOs that might need unwritten extents handling */
- struct list_head i_aio_dio_complete_list;
+ /* completed IOs that might need unwritten extents handling */
+ struct list_head i_completed_io_list;
/* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio;
};
@@ -1432,7 +1432,7 @@ extern int ext4_block_truncate_page(hand
struct address_space *mapping, loff_t from);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t ext4_get_reserved_space(struct inode *inode);
-extern int flush_aio_dio_completed_IO(struct inode *inode);
+extern int flush_completed_IO(struct inode *inode);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
Index: git-ext4/fs/ext4/super.c
===================================================================
--- git-ext4.orig/fs/ext4/super.c 2009-12-15 15:14:34.000000000 -0800
+++ git-ext4/fs/ext4/super.c 2009-12-15 16:03:05.000000000 -0800
@@ -711,7 +711,7 @@ static struct inode *ext4_alloc_inode(st
ei->i_allocated_meta_blocks = 0;
ei->i_delalloc_reserved_flag = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
- INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+ INIT_LIST_HEAD(&ei->i_completed_io_list);
ei->cur_aio_dio = NULL;

return &ei->vfs_inode;
Index: git-ext4/fs/ext4/fsync.c
===================================================================
--- git-ext4.orig/fs/ext4/fsync.c 2009-12-15 15:14:34.000000000 -0800
+++ git-ext4/fs/ext4/fsync.c 2009-12-15 16:03:05.000000000 -0800
@@ -58,7 +58,7 @@ int ext4_sync_file(struct file *file, st

trace_ext4_sync_file(file, dentry, datasync);

- ret = flush_aio_dio_completed_IO(inode);
+ ret = flush_completed_IO(inode);
if (ret < 0)
return ret;
/*
Index: git-ext4/fs/ext4/inode.c
===================================================================
--- git-ext4.orig/fs/ext4/inode.c 2009-12-15 15:14:34.000000000 -0800
+++ git-ext4/fs/ext4/inode.c 2009-12-15 16:03:05.000000000 -0800
@@ -3606,52 +3606,44 @@ out:
return ret;
}

-static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- handle_t *handle = NULL;
+ handle_t *handle = ext4_journal_current_handle();
int ret = 0;
unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
int dio_credits;
+ int started = 0;

- ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+ ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
inode->i_ino, create);
/*
- * DIO VFS code passes create = 0 flag for write to
- * the middle of file. It does this to avoid block
- * allocation for holes, to prevent expose stale data
- * out when there is parallel buffered read (which does
- * not hold the i_mutex lock) while direct IO write has
- * not completed. DIO request on holes finally falls back
- * to buffered IO for this reason.
- *
- * For ext4 extent based file, since we support fallocate,
- * new allocated extent as uninitialized, for holes, we
- * could fallocate blocks for holes, thus parallel
- * buffered IO read will zero out the page when read on
- * a hole while parallel DIO write to the hole has not completed.
- *
- * when we come here, we know it's a direct IO write to
- * to the middle of file (<i_size)
- * so it's safe to override the create flag from VFS.
- */
- create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
-
- if (max_blocks > DIO_MAX_BLOCKS)
- max_blocks = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
- handle = ext4_journal_start(inode, dio_credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
+ * ext4_get_block in prepare for a DIO write or buffer write.
+ * We allocate an uinitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after IO complete.
+ */
+ create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
+
+ if (!handle) {
+ if (max_blocks > DIO_MAX_BLOCKS)
+ max_blocks = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ handle = ext4_journal_start(inode, dio_credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ started = 1;
}
+
ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
create);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
}
- ext4_journal_stop(handle);
+ if (started)
+ ext4_journal_stop(handle);
out:
return ret;
}
@@ -3662,19 +3654,20 @@ static void ext4_free_io_end(ext4_io_end
iput(io->inode);
kfree(io);
}
-static void dump_aio_dio_list(struct inode * inode)
+
+static void dump_completed_IO(struct inode * inode)
{
#ifdef EXT4_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;

- if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
- ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+ if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
+ ext4_debug("inode %lu completed_io list is empty\n",
inode->i_ino);
return;
}

- ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
- list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
+ ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+ list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
cur = &io->list;
before = cur->prev;
io0 = container_of(before, ext4_io_end_t, list);
@@ -3690,21 +3683,21 @@ static void dump_aio_dio_list(struct ino
/*
* check a range of space and convert unwritten extents to written.
*/
-static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
+static int ext4_end_io_nolock(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
size_t size = io->size;
int ret = 0;

- ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+ ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);

if (list_empty(&io->list))
return ret;

- if (io->flag != DIO_AIO_UNWRITTEN)
+ if (io->flag != EXT4_IO_UNWRITTEN)
return ret;

if (offset + size <= i_size_read(inode))
@@ -3722,17 +3715,18 @@ static int ext4_end_aio_dio_nolock(ext4_
io->flag = 0;
return ret;
}
+
/*
* work on completed aio dio IO, to convert unwritten extents to extents
*/
-static void ext4_end_aio_dio_work(struct work_struct *work)
+static void ext4_end_io_work(struct work_struct *work)
{
ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
struct inode *inode = io->inode;
int ret = 0;

mutex_lock(&inode->i_mutex);
- ret = ext4_end_aio_dio_nolock(io);
+ ret = ext4_end_io_nolock(io);
if (ret >= 0) {
if (!list_empty(&io->list))
list_del_init(&io->list);
@@ -3740,32 +3734,35 @@ static void ext4_end_aio_dio_work(struct
}
mutex_unlock(&inode->i_mutex);
}
+
/*
* This function is called from ext4_sync_file().
*
- * When AIO DIO IO is completed, the work to convert unwritten
- * extents to written is queued on workqueue but may not get immediately
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on workqueue but may not get immediately
* scheduled. When fsync is called, we need to ensure the
* conversion is complete before fsync returns.
- * The inode keeps track of a list of completed AIO from DIO path
- * that might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents to written.
+ * The inode keeps track of a list of pending/completed IO that
+ * might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents for completed IO
+ * to written.
+ * The function return the number of pending IOs on success.
*/
-int flush_aio_dio_completed_IO(struct inode *inode)
+int flush_completed_IO(struct inode *inode)
{
ext4_io_end_t *io;
int ret = 0;
int ret2 = 0;

- if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+ if (list_empty(&EXT4_I(inode)->i_completed_io_list))
return ret;

- dump_aio_dio_list(inode);
- while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
- io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+ dump_completed_IO(inode);
+ while (!list_empty(&EXT4_I(inode)->i_completed_io_list)){
+ io = list_entry(EXT4_I(inode)->i_completed_io_list.next,
ext4_io_end_t, list);
/*
- * Calling ext4_end_aio_dio_nolock() to convert completed
+ * Calling ext4_end_io_nolock() to convert completed
* IO to written.
*
* When ext4_sync_file() is called, run_queue() may already
@@ -3778,7 +3775,7 @@ int flush_aio_dio_completed_IO(struct in
* avoid double converting from both fsync and background work
* queue work.
*/
- ret = ext4_end_aio_dio_nolock(io);
+ ret = ext4_end_io_nolock(io);
if (ret < 0)
ret2 = ret;
else
@@ -3800,7 +3797,7 @@ static ext4_io_end_t *ext4_init_io_end (
io->offset = 0;
io->size = 0;
io->error = 0;
- INIT_WORK(&io->work, ext4_end_aio_dio_work);
+ INIT_WORK(&io->work, ext4_end_io_work);
INIT_LIST_HEAD(&io->list);
}

@@ -3823,7 +3820,7 @@ static void ext4_end_io_dio(struct kiocb
size);

/* if not aio dio with unwritten extents, just free io and return */
- if (io_end->flag != DIO_AIO_UNWRITTEN){
+ if (io_end->flag != EXT4_IO_UNWRITTEN){
ext4_free_io_end(io_end);
iocb->private = NULL;
return;
@@ -3838,9 +3835,10 @@ static void ext4_end_io_dio(struct kiocb

/* Add the io_end to per-inode completed aio dio list*/
list_add_tail(&io_end->list,
- &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
+ &EXT4_I(io_end->inode)->i_completed_io_list);
iocb->private = NULL;
}
+
/*
* For ext4 extent files, ext4 will do direct-io write to holes,
* preallocated extents, and those write extend the file, no need to
@@ -3910,7 +3908,7 @@ static ssize_t ext4_ext_direct_IO(int rw
ret = blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
- ext4_get_block_dio_write,
+ ext4_get_block_write,
ext4_end_io_dio);
if (iocb->private)
EXT4_I(inode)->cur_aio_dio = NULL;

2009-12-26 07:03:10

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] ext4: DIO get_block code cleanup

On Tue, Dec 15, 2009 at 05:37:53PM -0800, Jiaying Zhang wrote:
> ext4: dio get_block code cleanup in prepare for it to be used by buffer write
>
> Renaming the dio block allocation flags, variables and functions
> introduced in Mingming's "Direct IO for holes and fallocate"
> patches so that they can be used by ext4 buffer write as well.
>
> Signed-off-by: Jiaying Zhang <[email protected]>

Hi Jiaying,

Merry Christmas!

Sorry, between trying to prepare patches for the 2.6.33 merge window,
and closing out my current position at the Linux Foundation, and the
resulting job transition, this has fallen through the cracks. I've
started looking at your patch now. A couple of comments so far.

1) The patches were white-space damaged, so applying them is a bit
difficult. I gave up entirely on patch #4, which hopefully is a
mechnical only patch, but it's also one which is apparently only a
cleanup.

Patch #1 had a combination of mechanical changes (i.e., naming of
variables and constants) as well as some non-mechnical changes.
Especially in the case of patch #4, when large chunks of code are
being moved around, the patch which mechnical changes is very likely
to break if other patches are applied (or reverted) compared to
whatever version the patch was based off of. If the mechnical changes
are separated out (and clearly described in the commits), then I can
easily regenerate the mechnical changes, and not worry about
accidentally dropping changes.

2) In the first patch, in ext4_sync_file() in fs/ext4/fsync.c, the
call to flush_aio_dio_completed_IO() is renamed to
flush_completed_IO(). But then in the second patch, a second call to
flush_completed_IO() is added at the end of ext4_sync_file(). That
doesn't look right to me.

I need to look at the code more closely, which I will do later. In
the mean time, I've added the first three patches which you sent to
the unstable portion of the ext4 patch queue. When run against the
xfsqa regression test suite, it quickly shows a failure:

BUG: unable to handle kernel NULL pointer dereference at 00000004
IP: [<c0262b58>] __journal_remove_journal_head+0xf/0xcf
*pdpt = 0000000019b91001 *pde = 0000000000000000
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
last sysfs file:
Modules linked in:

Pid: 6536, comm: jbd2/sdb-8 Not tainted 2.6.32-06814-g26716e4 #225 /
EIP: 0060:[<c0262b58>] EFLAGS: 00010246 CPU: 0
EIP is at __journal_remove_journal_head+0xf/0xcf
EAX: de570e40 EBX: 00000000 ECX: 00000000 EDX: de570e40
ESI: de570e40 EDI: 00000002 EBP: d8a94e5c ESP: d8a94e54
DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
Process jbd2/sdb-8 (pid: 6536, ti=d8a94000 task=df3bbf90 task.ti=d8a94000)
Stack:
de570e40 de570e40 d8a94e68 c0262c87 d4b7c140 d8a94e8c c025fe4b 00000001
<0> d8a94e9c d4b7c140 d8f630c0 00000000 cf6ba540 d8a94e9c d8a94eac c025fed9
<0> dbc50d80 dbc50d80 00000000 dc754800 00000000 dc754800 d8a94f5c c025e20a
Call Trace:
[<c0262c87>] ? jbd2_journal_remove_journal_head+0x1e/0x2d
[<c025fe4b>] ? journal_clean_one_cp_list+0x68/0xbd
[<c025fed9>] ? __jbd2_journal_clean_checkpoint_list+0x39/0x7b
[<c025e20a>] ? jbd2_journal_commit_transaction+0x2c5/0x1173
[<c017bb17>] ? trace_hardirqs_off+0xb/0xd
[<c01631ec>] ? try_to_del_timer_sync+0x5e/0x66
[<c062eae5>] ? _spin_unlock_irqrestore+0x41/0x4d
[<c017d019>] ? trace_hardirqs_on+0xb/0xd
[<c01631ec>] ? try_to_del_timer_sync+0x5e/0x66
[<c01635bb>] ? del_timer_sync+0x78/0x88
[<c0263b66>] ? kjournald2+0x116/0x309
[<c016d9b0>] ? autoremove_wake_function+0x0/0x34
[<c0263a50>] ? kjournald2+0x0/0x309
[<c016d785>] ? kthread+0x64/0x69
[<c016d721>] ? kthread+0x0/0x69
[<c012567b>] ? kernel_thread_helper+0x7/0x10
Code: f6 ff eb 15 89 d8 e8 97 91 f7 ff eb 0c e8 71 ff ff ff 89 da e8 8c 3e f8 ff 5b 5d c3 55 89 e5 56 53 0f 1f 44 00 00 8b 58 24 89 c6 <83> 7b 04 00 79 04 0f 0b eb fe f0 ff 40 34 83 7b 04 00 0f 85 a1
EIP: [<c0262b58>] __journal_remove_journal_head+0xf/0xcf SS:ESP 0068:d8a94e54
CR2: 0000000000000004

Removing your three patches makes the failure go away. I'll analyze
this some more later this weekend.

Best regards,

- Ted

2009-12-27 09:24:13

by Jiaying Zhang

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] ext4: DIO get_block code cleanup

Hi Ted,

Thanks a lot for the feedback on Christmas and happy holiday!

On Fri, Dec 25, 2009 at 11:03 PM, <[email protected]> wrote:
> On Tue, Dec 15, 2009 at 05:37:53PM -0800, Jiaying Zhang wrote:
>> ext4: dio get_block code cleanup in prepare for it to be used by buffer write
>>
>> Renaming the dio block allocation flags, variables and functions
>> introduced in Mingming's "Direct IO for holes and fallocate"
>> patches so that they can be used by ext4 buffer write as well.
>>
>> Signed-off-by: Jiaying Zhang <[email protected]>
>
> Hi Jiaying,
>
> Merry Christmas!
>
> Sorry, between trying to prepare patches for the 2.6.33 merge window,
> and closing out my current position at the Linux Foundation, and the
> resulting job transition, this has fallen through the cracks. ?I've
> started looking at your patch now. ? A couple of comments so far.
>
> 1) The patches were white-space damaged, so applying them is a bit
> difficult. ?I gave up entirely on patch #4, which hopefully is a
> mechnical only patch, but it's also one which is apparently only a
> cleanup.

Really sorry about this. I guess my email client turned tab into spaces.
The fourth patch is only code moving around. As you said, you can easily
regenerate it later so I think we can just leave it out during the review.

>
> Patch #1 had a combination of mechanical changes (i.e., naming of
> variables and constants) as well as some non-mechnical changes.
> Especially in the case of patch #4, when large chunks of code are
> being moved around, the patch which mechnical changes is very likely
> to break if other patches are applied (or reverted) compared to
> whatever version the patch was based off of. ?If the mechnical changes
> are separated out (and clearly described in the commits), then I can
> easily regenerate the mechnical changes, and not worry about
> accidentally dropping changes.

I intended to have the first patch only include the mechanical changes,
but left some other changes in it after code rebase. I will move those
changes to the second patch in the next version.

>
> 2) In the first patch, in ext4_sync_file() in fs/ext4/fsync.c, the
> call to flush_aio_dio_completed_IO() is renamed to
> flush_completed_IO(). ?But then in the second patch, a second call to
> flush_completed_IO() is added at the end of ext4_sync_file(). ?That
> doesn't look right to me.

The reason I added another flush_completed_IO() at the end of
ext4_sync_file is to finish uninitialized-to-initialized extent conversion
for any buffer writes that may be generated during sync_mapping_buffers.
The completed_io_list may not be empty at this time because
ext4_end_io_buffer_write only queues io_end work that is processed
by ext4_end_io_work later. Is there any problem that you think this
change may introduce?

>
> I need to look at the code more closely, which I will do later. ?In
> the mean time, I've added the first three patches which you sent to
> the unstable portion of the ext4 patch queue. ?When run against the
> xfsqa regression test suite, it quickly shows a failure:
>
> BUG: unable to handle kernel NULL pointer dereference at 00000004
> IP: [<c0262b58>] __journal_remove_journal_head+0xf/0xcf
> *pdpt = 0000000019b91001 *pde = 0000000000000000
> Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
> last sysfs file:
> Modules linked in:
>
> Pid: 6536, comm: jbd2/sdb-8 Not tainted 2.6.32-06814-g26716e4 #225 /
> EIP: 0060:[<c0262b58>] EFLAGS: 00010246 CPU: 0
> EIP is at __journal_remove_journal_head+0xf/0xcf
> EAX: de570e40 EBX: 00000000 ECX: 00000000 EDX: de570e40
> ESI: de570e40 EDI: 00000002 EBP: d8a94e5c ESP: d8a94e54
> ?DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
> Process jbd2/sdb-8 (pid: 6536, ti=d8a94000 task=df3bbf90 task.ti=d8a94000)
> Stack:
> ?de570e40 de570e40 d8a94e68 c0262c87 d4b7c140 d8a94e8c c025fe4b 00000001
> <0> d8a94e9c d4b7c140 d8f630c0 00000000 cf6ba540 d8a94e9c d8a94eac c025fed9
> <0> dbc50d80 dbc50d80 00000000 dc754800 00000000 dc754800 d8a94f5c c025e20a
> Call Trace:
> ?[<c0262c87>] ? jbd2_journal_remove_journal_head+0x1e/0x2d
> ?[<c025fe4b>] ? journal_clean_one_cp_list+0x68/0xbd
> ?[<c025fed9>] ? __jbd2_journal_clean_checkpoint_list+0x39/0x7b
> ?[<c025e20a>] ? jbd2_journal_commit_transaction+0x2c5/0x1173
> ?[<c017bb17>] ? trace_hardirqs_off+0xb/0xd
> ?[<c01631ec>] ? try_to_del_timer_sync+0x5e/0x66
> ?[<c062eae5>] ? _spin_unlock_irqrestore+0x41/0x4d
> ?[<c017d019>] ? trace_hardirqs_on+0xb/0xd
> ?[<c01631ec>] ? try_to_del_timer_sync+0x5e/0x66
> ?[<c01635bb>] ? del_timer_sync+0x78/0x88
> ?[<c0263b66>] ? kjournald2+0x116/0x309
> ?[<c016d9b0>] ? autoremove_wake_function+0x0/0x34
> ?[<c0263a50>] ? kjournald2+0x0/0x309
> ?[<c016d785>] ? kthread+0x64/0x69
> ?[<c016d721>] ? kthread+0x0/0x69
> ?[<c012567b>] ? kernel_thread_helper+0x7/0x10
> Code: f6 ff eb 15 89 d8 e8 97 91 f7 ff eb 0c e8 71 ff ff ff 89 da e8 8c 3e f8 ff 5b 5d c3 55 89 e5 56 53 0f 1f 44 00 00 8b 58 24 89 c6 <83> 7b 04 00 79 04 0f 0b eb fe f0 ff 40 34 83 7b 04 00 0f 85 a1
> EIP: [<c0262b58>] __journal_remove_journal_head+0xf/0xcf SS:ESP 0068:d8a94e54
> CR2: 0000000000000004
>

From the stack trace, the problem seems to be caused by
the use of bh->b_private to hold end_io in my patch. I noticed
that b_private is used by jdb2 to hold journal handle so the
introduced change is disabled if the ext4 is mounted with
data=journal. As I understand, with data=ordered, any buffer
heads held in the journal should be metadata. But I guess either
this is not sure or there is some case where I also changed
b_private field for metadata write.

Jiaying

> Removing your three patches makes the failure go away. ?I'll analyze
> this some more later this weekend.
>
> Best regards,
>
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?- Ted
>

2009-12-29 06:26:01

by Jiaying Zhang

[permalink] [raw]

Subject: Re: [RFC PATCH 1/4] ext4: DIO get_block code cleanup

Hi Ted,

On Sun, Dec 27, 2009 at 1:24 AM, Jiaying Zhang <[email protected]> wrote:
> Hi Ted,
>
> Thanks a lot for the feedback on Christmas and happy holiday!
>
> On Fri, Dec 25, 2009 at 11:03 PM, ?<[email protected]> wrote:
>> On Tue, Dec 15, 2009 at 05:37:53PM -0800, Jiaying Zhang wrote:
>>> ext4: dio get_block code cleanup in prepare for it to be used by buffer write
>>>
>>> Renaming the dio block allocation flags, variables and functions
>>> introduced in Mingming's "Direct IO for holes and fallocate"
>>> patches so that they can be used by ext4 buffer write as well.
>>>
>>> Signed-off-by: Jiaying Zhang <[email protected]>
>>
>> Hi Jiaying,
>>
>> Merry Christmas!
>>
>> Sorry, between trying to prepare patches for the 2.6.33 merge window,
>> and closing out my current position at the Linux Foundation, and the
>> resulting job transition, this has fallen through the cracks. ?I've
>> started looking at your patch now. ? A couple of comments so far.
>>
>> 1) The patches were white-space damaged, so applying them is a bit
>> difficult. ?I gave up entirely on patch #4, which hopefully is a
>> mechnical only patch, but it's also one which is apparently only a
>> cleanup.
>
> Really sorry about this. I guess my email client turned tab into spaces.
> The fourth patch is only code moving around. As you said, you can easily
> regenerate it later so I think we can just leave it out during the review.
>
>>
>> Patch #1 had a combination of mechanical changes (i.e., naming of
>> variables and constants) as well as some non-mechnical changes.
>> Especially in the case of patch #4, when large chunks of code are
>> being moved around, the patch which mechnical changes is very likely
>> to break if other patches are applied (or reverted) compared to
>> whatever version the patch was based off of. ?If the mechnical changes
>> are separated out (and clearly described in the commits), then I can
>> easily regenerate the mechnical changes, and not worry about
>> accidentally dropping changes.
>
> I intended to have the first patch only include the mechanical changes,
> but left some other changes in it after code rebase. I will move those
> changes to the second patch in the next version.
>
>>
>> 2) In the first patch, in ext4_sync_file() in fs/ext4/fsync.c, the
>> call to flush_aio_dio_completed_IO() is renamed to
>> flush_completed_IO(). ?But then in the second patch, a second call to
>> flush_completed_IO() is added at the end of ext4_sync_file(). ?That
>> doesn't look right to me.
>
> The reason I added another flush_completed_IO() at the end of
> ext4_sync_file is to finish uninitialized-to-initialized extent conversion
> for any buffer writes that may be generated during sync_mapping_buffers.
> The completed_io_list may not be empty at this time because
> ext4_end_io_buffer_write only queues io_end work that is processed
> by ext4_end_io_work later. Is there any problem that you think this
> change may introduce?
>
I realized that the later sync_mapping_buffers in ext4_sync_file
should only write metadata buffers after reading the function again.
So you are right. We don't need the send flush_completed_IO().

>>
>> I need to look at the code more closely, which I will do later. ?In
>> the mean time, I've added the first three patches which you sent to
>> the unstable portion of the ext4 patch queue. ?When run against the
>> xfsqa regression test suite, it quickly shows a failure:
>>
>> BUG: unable to handle kernel NULL pointer dereference at 00000004
>> IP: [<c0262b58>] __journal_remove_journal_head+0xf/0xcf
>> *pdpt = 0000000019b91001 *pde = 0000000000000000
>> Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
>> last sysfs file:
>> Modules linked in:
>>
>> Pid: 6536, comm: jbd2/sdb-8 Not tainted 2.6.32-06814-g26716e4 #225 /
>> EIP: 0060:[<c0262b58>] EFLAGS: 00010246 CPU: 0
>> EIP is at __journal_remove_journal_head+0xf/0xcf
>> EAX: de570e40 EBX: 00000000 ECX: 00000000 EDX: de570e40
>> ESI: de570e40 EDI: 00000002 EBP: d8a94e5c ESP: d8a94e54
>> ?DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
>> Process jbd2/sdb-8 (pid: 6536, ti=d8a94000 task=df3bbf90 task.ti=d8a94000)
>> Stack:
>> ?de570e40 de570e40 d8a94e68 c0262c87 d4b7c140 d8a94e8c c025fe4b 00000001
>> <0> d8a94e9c d4b7c140 d8f630c0 00000000 cf6ba540 d8a94e9c d8a94eac c025fed9
>> <0> dbc50d80 dbc50d80 00000000 dc754800 00000000 dc754800 d8a94f5c c025e20a
>> Call Trace:
>> ?[<c0262c87>] ? jbd2_journal_remove_journal_head+0x1e/0x2d
>> ?[<c025fe4b>] ? journal_clean_one_cp_list+0x68/0xbd
>> ?[<c025fed9>] ? __jbd2_journal_clean_checkpoint_list+0x39/0x7b
>> ?[<c025e20a>] ? jbd2_journal_commit_transaction+0x2c5/0x1173
>> ?[<c017bb17>] ? trace_hardirqs_off+0xb/0xd
>> ?[<c01631ec>] ? try_to_del_timer_sync+0x5e/0x66
>> ?[<c062eae5>] ? _spin_unlock_irqrestore+0x41/0x4d
>> ?[<c017d019>] ? trace_hardirqs_on+0xb/0xd
>> ?[<c01631ec>] ? try_to_del_timer_sync+0x5e/0x66
>> ?[<c01635bb>] ? del_timer_sync+0x78/0x88
>> ?[<c0263b66>] ? kjournald2+0x116/0x309
>> ?[<c016d9b0>] ? autoremove_wake_function+0x0/0x34
>> ?[<c0263a50>] ? kjournald2+0x0/0x309
>> ?[<c016d785>] ? kthread+0x64/0x69
>> ?[<c016d721>] ? kthread+0x0/0x69
>> ?[<c012567b>] ? kernel_thread_helper+0x7/0x10
>> Code: f6 ff eb 15 89 d8 e8 97 91 f7 ff eb 0c e8 71 ff ff ff 89 da e8 8c 3e f8 ff 5b 5d c3 55 89 e5 56 53 0f 1f 44 00 00 8b 58 24 89 c6 <83> 7b 04 00 79 04 0f 0b eb fe f0 ff 40 34 83 7b 04 00 0f 85 a1
>> EIP: [<c0262b58>] __journal_remove_journal_head+0xf/0xcf SS:ESP 0068:d8a94e54
>> CR2: 0000000000000004
>>
>
> From the stack trace, the problem seems to be caused by
> the use of bh->b_private to hold end_io in my patch. I noticed
> that b_private is used by jdb2 to hold journal handle so the
> introduced change is disabled if the ext4 is mounted with
> data=journal. As I understand, with data=ordered, any buffer
> heads held in the journal should be metadata. But I guess either
> this is not sure or there is some case where I also changed
> b_private field for metadata write.

I had another look at my patch and found that there might be some
case that we still call ext4_end_io_buffer_write() even with
data=journal mode. Could you try the patch below on top of
my previous three patches and see whether it fixes the problem?
Or even better, I would like to run the xfsqa test myself if you
could point me to the instructions on how to get and run it.

Thanks a lot!

fs/ext4/extents.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

Index: git-ext4/fs/ext4/extents.c
===================================================================
--- git-ext4.orig/fs/ext4/extents.c 2009-12-15 16:03:15.000000000 -0800
+++ git-ext4/fs/ext4/extents.c 2009-12-28 22:17:12.000000000 -0800
@@ -3052,7 +3052,8 @@ ext4_ext_handle_uninitialized_extents(ha
io->flag = EXT4_IO_UNWRITTEN;
else
EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
- set_buffer_uninit(bh_result);
+ if (test_opt(inode->i_sb, DIOREAD_NOLOCK))
+ set_buffer_uninit(bh_result);
goto out;
}
/* IO end_io complete, convert the filled extent to written */
@@ -3305,7 +3306,8 @@ int ext4_ext_get_blocks(handle_t *handle
EXT4_I(inode)->i_state |=
EXT4_STATE_DIO_UNWRITTEN;;
}
- set_buffer_uninit(bh_result);
+ if (test_opt(inode->i_sb, DIOREAD_NOLOCK))
+ set_buffer_uninit(bh_result);
}
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {

Jiaying

>
> Jiaying
>
>> Removing your three patches makes the failure go away. ?I'll analyze
>> this some more later this weekend.
>>
>> Best regards,
>>
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?- Ted
>>
>