2009-09-04 00:44:51

by Mingming Cao

[permalink] [raw]
Subject: [PATCH 2/2 V3] allow direct IO to fallocate and holes

ext4: Use end_io call back to avoid direct I/O fallback to buffered I/O

From: Mingming <[email protected]>

Currently the DIO VFS code passes create = 0 when writing to the
middle of a file. It does this to avoid block allocation for holes, so
as not to expose stale data when there is a parallel buffered read
(which does not hold the i_mutex lock). Direct I/O writes into holes
fall back to buffered IO for this reason.

Since preallocated extents are treated as holes when doing a
get_block() look up (buffer is not mapped), direct IO over fallocate
also falls back to buffered IO. Thus ext4 actually silently falls
back to buffered IO in above two cases, which is undesirable.

To fix this, this patch creates uninitialized extents when a direct I/O
write goes into a hole in a sparse file, and registers an end_io callback
which converts the uninitialized extent to an initialized extent after the
I/O is completed.

Signed-off-by: Mingming Cao <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4.h | 14 +++
fs/ext4/inode.c | 199 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
fs/ext4/super.c | 11 +++
3 files changed, 223 insertions(+), 1 deletion(-)

Index: linux-2.6.31-rc4/fs/ext4/ext4.h
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/ext4.h
+++ linux-2.6.31-rc4/fs/ext4/ext4.h
@@ -978,6 +978,9 @@ struct ext4_sb_info {

unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
+
+ /* workqueue for dio unwritten */
+ struct workqueue_struct *dio_unwritten_wq;
};

static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
Index: linux-2.6.31-rc4/fs/ext4/super.c
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/super.c
+++ linux-2.6.31-rc4/fs/ext4/super.c
@@ -578,6 +578,9 @@ static void ext4_put_super(struct super_
struct ext4_super_block *es = sbi->s_es;
int i, err;

+ flush_workqueue(sbi->dio_unwritten_wq);
+ destroy_workqueue(sbi->dio_unwritten_wq);
+
lock_super(sb);
lock_kernel();
if (sb->s_dirt)
@@ -2781,6 +2784,12 @@ no_journal:
clear_opt(sbi->s_mount_opt, NOBH);
}
}
+ EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
+ if (!EXT4_SB(sb)->dio_unwritten_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ goto failed_mount_wq;
+ }
+
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
@@ -2893,6 +2902,8 @@ cantfind_ext4:

failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
+ destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+failed_mount_wq:
ext4_release_system_zone(sb);
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
Index: linux-2.6.31-rc4/fs/ext4/inode.c
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/inode.c
+++ linux-2.6.31-rc4/fs/ext4/inode.c
@@ -37,6 +37,7 @@
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
+#include <linux/workqueue.h>

#include "ext4_jbd2.h"
#include "xattr.h"
@@ -3282,6 +3283,8 @@ static int ext4_releasepage(struct page
}

/*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
* if the machine crashes during the write.
@@ -3290,7 +3293,7 @@ static int ext4_releasepage(struct page
* crashes then stale disk data _may_ be exposed inside the file. But current
* VFS code falls back into buffered path in that case so we are safe.
*/
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
@@ -3364,6 +3367,198 @@ out:
return ret;
}

+/* Maximum number of blocks we map for direct IO at once. */
+
+static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ handle_t *handle = NULL;
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int dio_credits;
+
+ /*
+ * DIO VFS code passes create = 0 flag for write to
+ * the middle of file. It does this to avoid block
+ * allocation for holes, to prevent expose stale data
+ * out when there is parallel buffered read (which does
+ * not hold the i_mutex lock) while direct IO write has
+ * not completed. DIO request on holes finally falls back
+ * to buffered IO for this reason.
+ *
+ * For ext4 extent based file, since we support fallocate,
+ * new allocated extent as uninitialized, for holes, we
+ * could fallocate blocks for holes, thus parallel
+ * buffered IO read will zero out the page when read on
+ * a hole while parallel DIO write to the hole has not completed.
+ *
+ * when we come here, we know it's a direct IO write to
+ * to the middle of file (<i_size)
+ * so it's safe to override the create flag from VFS.
+ */
+ create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+
+ if (max_blocks > DIO_MAX_BLOCKS)
+ max_blocks = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ handle = ext4_journal_start(inode, dio_credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+ create);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ ext4_journal_stop(handle);
+out:
+ return ret;
+}
+
+#define DIO_AIO 0x1
+
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+ kfree(io);
+}
+
+/*
+ * IO write completion for unwritten extents.
+ *
+ * check a range of space and convert unwritten extents to written.
+ */
+static void ext4_end_dio_unwritten(struct work_struct *work)
+{
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ struct inode *inode = io->inode;
+ loff_t offset = io->offset;
+ size_t size = io->size;
+ int ret = 0;
+ int aio = io->flag & DIO_AIO;
+
+ if (aio)
+ mutex_lock(&inode->i_mutex);
+ if (offset + size <= i_size_read(inode))
+ ret = ext4_convert_unwritten_extents(inode, offset, size);
+
+ if (ret < 0)
+ printk(KERN_EMERG "%s: failed to convert unwritten"
+ "extents to written extents, error is %d\n",
+ __func__, ret);
+
+ ext4_free_io_end(io);
+ if (aio)
+ mutex_unlock(&inode->i_mutex);
+}
+
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag)
+{
+ ext4_io_end_t *io = NULL;
+
+ io = kmalloc(sizeof(*io), GFP_NOFS);
+
+ if (io) {
+ io->inode = inode;
+ io->flag = flag;
+ io->offset = 0;
+ io->size = 0;
+ io->error = 0;
+ INIT_WORK(&io->work, ext4_end_dio_unwritten);
+ }
+
+ return io;
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private)
+{
+ ext4_io_end_t *io_end = iocb->private;
+ struct workqueue_struct *wq;
+
+ /* if not hole or unwritten extents, just simple return */
+ if (!io_end || !size || !iocb->private)
+ return;
+ io_end->offset = offset;
+ io_end->size = size;
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+ /* We need to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+
+ if (is_sync_kiocb(iocb))
+ flush_workqueue(wq);
+
+ iocb->private = NULL;
+}
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as unintialized
+ * If those blocks were preallocated, we mark sure they are splited, but
+ * still keep the range to write as unintialized.
+ *
+ * When end_io call back function called at the last IO complete time,
+ * those extents will be converted to written extents.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+ size_t count = iov_length(iov, nr_segs);
+
+ loff_t final_size = offset + count;
+ if (rw == WRITE && final_size <= inode->i_size) {
+ /*
+ * For DIO we fallocate blocks for holes, we fallocate blocks
+ * The fallocated extent for hole is marked as uninitialized
+ * to prevent paralel buffered read to expose the stale data
+ * before DIO complete the data IO.
+ * as for previously fallocated extents, ext4 get_block
+ * will just simply mark the buffer mapped but still
+ * keep the extents uninitialized.
+ *
+ * At the end of IO, the ext4 end_io callback function
+ * will convert those unwritten extents to written,
+ *
+ */
+ iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb));
+ if (!iocb->private)
+ return -ENOMEM;
+ ret = blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_dio_write,
+ ext4_end_io_dio);
+ return ret;
+ }
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
/*
* Pages can be marked dirty completely asynchronously from ext4's journalling
* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do




2009-09-07 21:57:50

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 2/2 V3] allow direct IO to fallocate and holes

Hi Mingming,

the patch looks cleaner now.

On Thu 03-09-09 17:44:50, Mingming wrote:
> Index: linux-2.6.31-rc4/fs/ext4/inode.c
> ===================================================================
> --- linux-2.6.31-rc4.orig/fs/ext4/inode.c
> +++ linux-2.6.31-rc4/fs/ext4/inode.c
...
> +#define DIO_AIO 0x1
This flag isn't set anywhere...

> +static void ext4_free_io_end(ext4_io_end_t *io)
> +{
> + kfree(io);
> +}
> +
> +/*
> + * IO write completion for unwritten extents.
> + *
> + * check a range of space and convert unwritten extents to written.
> + */
> +static void ext4_end_dio_unwritten(struct work_struct *work)
> +{
> + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> + struct inode *inode = io->inode;
> + loff_t offset = io->offset;
> + size_t size = io->size;
> + int ret = 0;
> + int aio = io->flag & DIO_AIO;
> +
> + if (aio)
> + mutex_lock(&inode->i_mutex);
> + if (offset + size <= i_size_read(inode))
> + ret = ext4_convert_unwritten_extents(inode, offset, size);
> +
> + if (ret < 0)
> + printk(KERN_EMERG "%s: failed to convert unwritten"
> + "extents to written extents, error is %d\n",
> + __func__, ret);
> +
> + ext4_free_io_end(io);
> + if (aio)
> + mutex_unlock(&inode->i_mutex);
> +}
> +
> +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag)
> +{
> + ext4_io_end_t *io = NULL;
> +
> + io = kmalloc(sizeof(*io), GFP_NOFS);
> +
> + if (io) {
> + io->inode = inode;
You should __iget() the inode here and iput() it in the end IO handler at
least in the AIO case so that the inode remains pinned in memory.

> + io->flag = flag;
> + io->offset = 0;
> + io->size = 0;
> + io->error = 0;
> + INIT_WORK(&io->work, ext4_end_dio_unwritten);
> + }
> +
> + return io;
> +}
> +
> +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> + ssize_t size, void *private)
> +{
> + ext4_io_end_t *io_end = iocb->private;
> + struct workqueue_struct *wq;
> +
> + /* if not hole or unwritten extents, just simple return */
> + if (!io_end || !size || !iocb->private)
> + return;
> + io_end->offset = offset;
> + io_end->size = size;
> + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> +
> + /* We need to convert unwritten extents to written */
> + queue_work(wq, &io_end->work);
> +
> + if (is_sync_kiocb(iocb))
> + flush_workqueue(wq);
You have spaces instead of tabs above.

I think fsync() still won't work correctly, since it can happen that the user
sees the AIO completed and calls fsync(), which completes, but the conversion of
extents still has not happened because the conversion thread has not run yet.
The simple solution would be to call flush_workqueue() from ext4_sync_fs()
and ext4_fsync(). But that would needlessly make fsync() wait for
conversion in unrelated files. A more sophisticated solution would be to
attach io_end structures to the inode and do the work described by them
in ext4_fsync() (ext4_sync_fs() is fine doing flush_workqueue() since it
has to do all the work anyway). But in this solution you would have to be
careful to avoid races between fsync() and the completion thread.
Besides these problems the patch looks good.

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2009-09-09 20:51:01

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH 2/2 V3] allow direct IO to fallocate and holes

On Mon, 2009-09-07 at 23:57 +0200, Jan Kara wrote:
> Hi Minming,
>
> the patch looks cleaner now.
>

Hi Jan

Thanks for your review:)

> On Thu 03-09-09 17:44:50, Mingming wrote:
> > Index: linux-2.6.31-rc4/fs/ext4/inode.c
> > ===================================================================
> > --- linux-2.6.31-rc4.orig/fs/ext4/inode.c
> > +++ linux-2.6.31-rc4/fs/ext4/inode.c
> ...
> > +#define DIO_AIO 0x1
> This flag isn't set anywhere...
>
> > +static void ext4_free_io_end(ext4_io_end_t *io)
> > +{
> > + kfree(io);
> > +}
> > +
> > +/*
> > + * IO write completion for unwritten extents.
> > + *
> > + * check a range of space and convert unwritten extents to written.
> > + */
> > +static void ext4_end_dio_unwritten(struct work_struct *work)
> > +{
> > + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> > + struct inode *inode = io->inode;
> > + loff_t offset = io->offset;
> > + size_t size = io->size;
> > + int ret = 0;
> > + int aio = io->flag & DIO_AIO;
> > +
> > + if (aio)
> > + mutex_lock(&inode->i_mutex);
> > + if (offset + size <= i_size_read(inode))
> > + ret = ext4_convert_unwritten_extents(inode, offset, size);
> > +
> > + if (ret < 0)
> > + printk(KERN_EMERG "%s: failed to convert unwritten"
> > + "extents to written extents, error is %d\n",
> > + __func__, ret);
> > +
> > + ext4_free_io_end(io);
> > + if (aio)
> > + mutex_unlock(&inode->i_mutex);
> > +}
> > +
> > +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag)
> > +{
> > + ext4_io_end_t *io = NULL;
> > +
> > + io = kmalloc(sizeof(*io), GFP_NOFS);
> > +
> > + if (io) {
> > + io->inode = inode;
> You should __iget() the inode here and iput() it in the end IO handler at
> least in the AIO case so that the inode remains pinned in memory.
>

I just tried to add this, but hit some errors. I will dig more.
> > + io->flag = flag;
> > + io->offset = 0;
> > + io->size = 0;
> > + io->error = 0;
> > + INIT_WORK(&io->work, ext4_end_dio_unwritten);
> > + }
> > +
> > + return io;
> > +}
> > +
> > +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> > + ssize_t size, void *private)
> > +{
> > + ext4_io_end_t *io_end = iocb->private;
> > + struct workqueue_struct *wq;
> > +
> > + /* if not hole or unwritten extents, just simple return */
> > + if (!io_end || !size || !iocb->private)
> > + return;
> > + io_end->offset = offset;
> > + io_end->size = size;
> > + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> > +
> > + /* We need to convert unwritten extents to written */
> > + queue_work(wq, &io_end->work);
> > +
> > + if (is_sync_kiocb(iocb))
> > + flush_workqueue(wq);
> You have spaces instead of tabs above.
>
> I think fsync() still won't work correctly since it can happen user sees
> AIO completed, calls fsync() that completes,

hmm, does fsync() ensure user sees AIO data completed?

> but the conversion of extents
> still has not happened because the conversion thread as not run yet.
> The simple solution would be so flush_workqueue() from ext4_sync_fs()
> and ext4_fsync(). But that would needlessly make fsync() wait for
> conversion in unrelated files. More sophisticated solution would be to
> attach io_end structures to the inode and do the work described by them
> in ext4_fsync() (ext4_sync_fs() is fine doing flush_workqueue() since it
> has to do all the work anyway). But in this solution you would have to be
> careful to avoid races of fsync() and the completion thread.

Yeah, maybe link the list of io_end structures to the inode, and use a
lock to protect that link... It seems all AIO-related issues (including
ext3) should be addressed in another patch, do you agree? Or should we
address them here?

Mingming
> Besides these problems the patch looks good.
>
> Honza



2009-09-10 07:57:31

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH 2/2 V3] allow direct IO to fallocate and holes

On Wed, Sep 09, 2009 at 01:51:02PM -0700, Mingming wrote:
.....
.....

> > I think fsync() still won't work correctly since it can happen user sees
> > AIO completed, calls fsync() that completes,
>
> hmm, does fsync() ensure user sees AIO data completed?
>

If we call fsync after getting AIO completion event and crash we should
ensure that the data can be read back properly. That is either

a) we should ensure that we convert the extent before returning the io
completion event
b) Or the fsync should be able to guarantee that it will force the extent
conversion pending on the file.

-aneesh

2009-09-10 08:54:41

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 2/2 V3] allow direct IO to fallocate and holes

On Thu 10-09-09 13:27:30, Aneesh Kumar K.V wrote:
> On Wed, Sep 09, 2009 at 01:51:02PM -0700, Mingming wrote:
> .....
> .....
>
> > > I think fsync() still won't work correctly since it can happen user sees
> > > AIO completed, calls fsync() that completes,
> >
> > hmm, does fsync() ensure user sees AIO data completed?
> >
>
> If we call fsync after getting AIO completion event and crash we should
> ensure that the data can be read back properly. That is either
>
> a) we should ensure that we convert the extent before returning the io
> completion event
> b) Or the fsync should be able to guarantee that it will force the extent
> conversion pending on the file.
Exactly.

a) should happen in the sync-io case where we can afford to do IO from the
end_io callback (and due to the flush_workqueue call, it happens with
Mingming's current patch, so that is fine). But for the aio case, we have to
somehow implement b).

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2009-09-10 20:17:03

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH 2/2 V3] allow direct IO to fallocate and holes

On Thu, 2009-09-10 at 10:54 +0200, Jan Kara wrote:
> On Thu 10-09-09 13:27:30, Aneesh Kumar K.V wrote:
> > On Wed, Sep 09, 2009 at 01:51:02PM -0700, Mingming wrote:
> > .....
> > .....
> >
> > > > I think fsync() still won't work correctly since it can happen user sees
> > > > AIO completed, calls fsync() that completes,
> > >
> > > hmm, does fsync() ensure user sees AIO data completed?
> > >
> >
> > If we call fsync after getting AIO completion event and crash we should
> > ensure that the data can be read back properly. That is either
> >
> > a) we should ensure that we convert the extent before returning the io
> > completion event
> > b) Or the fsync should be able to guarantee that it will force the extent
> > conversion pending on the file.
> Exactly.
>
> a) should happen in the sync-io case where we can afford to do IO from the
> end_io callback (and due to flush_workqueue call, it happens with the
> current Mingming's patch so that is file). But for tha aio case, we have to
> somehow implement b).
>
> Honza


To implement b), I think we need to keep track of a list of IOs from AIO
that have completed but whose extents have not yet been converted, and
force fsync to run flush_queue on those completed IOs?

Mingming


2009-09-14 11:25:47

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 2/2 V3] allow direct IO to fallocate and holes

On Thu 10-09-09 13:17:00, Mingming wrote:
> On Thu, 2009-09-10 at 10:54 +0200, Jan Kara wrote:
> > On Thu 10-09-09 13:27:30, Aneesh Kumar K.V wrote:
> > > On Wed, Sep 09, 2009 at 01:51:02PM -0700, Mingming wrote:
> > > .....
> > > .....
> > >
> > > > > I think fsync() still won't work correctly since it can happen user sees
> > > > > AIO completed, calls fsync() that completes,
> > > >
> > > > hmm, does fsync() ensure user sees AIO data completed?
> > > >
> > >
> > > If we call fsync after getting AIO completion event and crash we should
> > > ensure that the data can be read back properly. That is either
> > >
> > > a) we should ensure that we convert the extent before returning the io
> > > completion event
> > > b) Or the fsync should be able to guarantee that it will force the extent
> > > conversion pending on the file.
> > Exactly.
> >
> > a) should happen in the sync-io case where we can afford to do IO from the
> > end_io callback (and due to flush_workqueue call, it happens with the
> > current Mingming's patch so that is file). But for tha aio case, we have to
> > somehow implement b).
> >
> to implement b), I think we need to keep track of a list of completed
> IOs from AIO, but not get converted extents, and force fsync to run
> flush_queue on those completed IOs?
Yes that would work fine.

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2009-09-18 22:29:47

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH 2/2 V3] allow direct IO to fallocate and holes

On Mon, 2009-09-14 at 13:25 +0200, Jan Kara wrote:
> On Thu 10-09-09 13:17:00, Mingming wrote:
> > On Thu, 2009-09-10 at 10:54 +0200, Jan Kara wrote:
> > > On Thu 10-09-09 13:27:30, Aneesh Kumar K.V wrote:
> > > > On Wed, Sep 09, 2009 at 01:51:02PM -0700, Mingming wrote:
> > > > .....
> > > > .....
> > > >
> > > > > > I think fsync() still won't work correctly since it can happen user sees
> > > > > > AIO completed, calls fsync() that completes,
> > > > >
> > > > > hmm, does fsync() ensure user sees AIO data completed?
> > > > >
> > > >
> > > > If we call fsync after getting AIO completion event and crash we should
> > > > ensure that the data can be read back properly. That is either
> > > >
> > > > a) we should ensure that we convert the extent before returning the io
> > > > completion event
> > > > b) Or the fsync should be able to guarantee that it will force the extent
> > > > conversion pending on the file.
> > > Exactly.
> > >
> > > a) should happen in the sync-io case where we can afford to do IO from the
> > > end_io callback (and due to flush_workqueue call, it happens with the
> > > current Mingming's patch so that is file). But for tha aio case, we have to
> > > somehow implement b).
> > >
> > to implement b), I think we need to keep track of a list of completed
> > IOs from AIO, but not get converted extents, and force fsync to run
> > flush_queue on those completed IOs?
> Yes that would work fine.
>
> Honza

It turns out it takes quite an effort to avoid the race between fsync()
and the work queue. But finally I think I got this right.

Here is the incremental fix. Please let me know what you think about
the approach. Thanks,
Mingming


ext4: async direct IO for holes and fallocate support

From: Mingming <[email protected]>

For async direct IO that covers holes or fallocated space, the end_io
callback function now queues the conversion work on the workqueue but
doesn't flush the work right away, as waiting for it might take too long.

But when fsync is called after all the data IO has completed, the user
expects the metadata to also be updated before fsync returns.

Thus we need to flush the conversion work when fsync() is called. This
patch keeps track of a list of completed async direct IOs that have work
queued on the workqueue. When fsync() is called, it will go through the list
and do the conversion.

Signed-off-by: Mingming Cao <[email protected]>
---
fs/ext4/ext4.h | 12 ++
fs/ext4/extents.c | 19 ++++
fs/ext4/fsync.c | 5 +
fs/ext4/inode.c | 231 +++++++++++++++++++++++++++++++++++++++++++++---------
fs/ext4/super.c | 8 +
5 files changed, 234 insertions(+), 41 deletions(-)

Index: linux-2.6.31-rc4/fs/ext4/inode.c
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/inode.c
+++ linux-2.6.31-rc4/fs/ext4/inode.c
@@ -3377,6 +3377,8 @@ static int ext4_get_block_dio_write(stru
unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
int dio_credits;

+ ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+ inode->i_ino, create);
/*
* DIO VFS code passes create = 0 flag for write to
* the middle of file. It does this to avoid block
@@ -3417,55 +3419,152 @@ out:
return ret;
}

-#define DIO_AIO 0x1
-
static void ext4_free_io_end(ext4_io_end_t *io)
{
+ BUG_ON(!io);
+ iput(io->inode);
kfree(io);
}
+static void dump_aio_dio_list(struct inode * inode)
+{
+#ifdef EXT4_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io, *io0, *io1;
+
+ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
+ ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+ return;
+ }
+
+ ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
+ list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
+ cur = &io->list;
+ before = cur->prev;
+ io0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io, inode->i_ino, io0, io1);
+ }
+#endif
+}

/*
- * IO write completion for unwritten extents.
- *
* check a range of space and convert unwritten extents to written.
*/
-static void ext4_end_dio_unwritten(struct work_struct *work)
+static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
{
- ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
struct inode *inode = io->inode;
loff_t offset = io->offset;
size_t size = io->size;
int ret = 0;
- int aio = io->flag & DIO_AIO;

- if (aio)
- mutex_lock(&inode->i_mutex);
+ ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+ "list->prev 0x%p\n",
+ io, inode->i_ino, io->list.next, io->list.prev);
+
+ if (list_empty(&io->list))
+ return ret;
+
+ if (io->flag != DIO_AIO_UNWRITTEN)
+ return ret;
+
if (offset + size <= i_size_read(inode))
ret = ext4_convert_unwritten_extents(inode, offset, size);

- if (ret < 0)
+ if (ret < 0) {
printk(KERN_EMERG "%s: failed to convert unwritten"
- "extents to written extents, error is %d\n",
- __func__, ret);
+ "extents to written extents, error is %d"
+ " io is still on inode %lu aio dio list\n",
+ __func__, ret, inode->i_ino);
+ return ret;
+ }
+
+ /* clear the DIO AIO unwritten flag */
+ io->flag = 0;
+ return ret;
+}
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_aio_dio_work(struct work_struct *work)
+{
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ struct inode *inode = io->inode;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+ ret = ext4_end_aio_dio_nolock(io);
+ if (ret >= 0) {
+ if (!list_empty(&io->list))
+ list_del_init(&io->list);
+ ext4_free_io_end(io);
+ }
+ mutex_unlock(&inode->i_mutex);
+}
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When AIO DIO IO is completed, the work to convert unwritten
+ * extents to written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of completed AIO from DIO path
+ * that might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents to written.
+ */
+int flush_aio_dio_completed_IO(struct inode *inode)
+{
+ ext4_io_end_t *io;
+ int ret = 0;
+ int ret2 = 0;

- ext4_free_io_end(io);
- if (aio)
- mutex_unlock(&inode->i_mutex);
+ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+ return ret;
+
+ dump_aio_dio_list(inode);
+ while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
+ io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+ ext4_io_end_t, list);
+ /*
+ * Calling ext4_end_aio_dio_nolock() to convert completed
+ * IO to written.
+ *
+ * When ext4_sync_file() is called, run_queue() may already
+ * about to flush the work corresponding to this io structure.
+ * It will be upset if it founds the io structure related
+ * to the work-to-be schedule is freed.
+ *
+ * Thus we need to keep the io structure still valid here after
+ * convertion finished. The io structure has a flag to
+ * avoid double converting from both fsync and background work
+ * queue work.
+ */
+ ret = ext4_end_aio_dio_nolock(io);
+ if (ret < 0)
+ ret2 = ret;
+ else
+ list_del_init(&io->list);
+ }
+ return (ret2 < 0) ? ret2 : 0;
}

-static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag)
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
{
ext4_io_end_t *io = NULL;

io = kmalloc(sizeof(*io), GFP_NOFS);

if (io) {
+ igrab(inode);
io->inode = inode;
- io->flag = flag;
+ io->flag = 0;
io->offset = 0;
io->size = 0;
io->error = 0;
- INIT_WORK(&io->work, ext4_end_dio_unwritten);
+ INIT_WORK(&io->work, ext4_end_aio_dio_work);
+ INIT_LIST_HEAD(&io->list);
}

return io;
@@ -3477,19 +3576,31 @@ static void ext4_end_io_dio(struct kiocb
ext4_io_end_t *io_end = iocb->private;
struct workqueue_struct *wq;

- /* if not hole or unwritten extents, just simple return */
- if (!io_end || !size || !iocb->private)
+ ext_debug("ext4_end_io_dio(): io_end 0x%p"
+ "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+ iocb->private, io_end->inode->i_ino, iocb, offset,
+ size);
+ /* if not async direct IO or dio with 0 bytes write, just return */
+ if (!io_end || !size)
+ return;
+
+ /* if not aio dio with unwritten extents, just free io and return */
+ if (io_end->flag != DIO_AIO_UNWRITTEN){
+ ext4_free_io_end(io_end);
+ iocb->private = NULL;
return;
+ }
+
io_end->offset = offset;
io_end->size = size;
wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;

- /* We need to convert unwritten extents to written */
+ /* queue the work to convert unwritten extents to written */
queue_work(wq, &io_end->work);

- if (is_sync_kiocb(iocb))
- flush_workqueue(wq);
-
+ /* Add the io_end to per-inode completed aio dio list*/
+ list_add_tail(&io_end->list,
+ &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
iocb->private = NULL;
}
/*
@@ -3501,8 +3612,10 @@ static void ext4_end_io_dio(struct kiocb
* If those blocks were preallocated, we mark sure they are splited, but
* still keep the range to write as unintialized.
*
- * When end_io call back function called at the last IO complete time,
- * those extents will be converted to written extents.
+ * The unwrritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still pending when return, we
+ * set up an end_io call back function, which will do the convertion
+ * when async direct IO completed.
*
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
@@ -3521,28 +3634,76 @@ static ssize_t ext4_ext_direct_IO(int rw
loff_t final_size = offset + count;
if (rw == WRITE && final_size <= inode->i_size) {
/*
- * For DIO we fallocate blocks for holes, we fallocate blocks
- * The fallocated extent for hole is marked as uninitialized
+ * We can do direct writes to holes and to fallocated extents.
+ *
+ * Allocated blocks to fill the hole are marked as uninitialized
* to prevent paralel buffered read to expose the stale data
* before DIO complete the data IO.
- * as for previously fallocated extents, ext4 get_block
+ *
+ * As to previously fallocated extents, ext4 get_block
* will just simply mark the buffer mapped but still
* keep the extents uninitialized.
*
- * At the end of IO, the ext4 end_io callback function
- * will convert those unwritten extents to written,
- *
+ * for non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * for async DIO, the conversion needs to be deferred until
+ * the IO is completed. The ext4 end_io callback function
+ * will be called to take care of the conversion work.
+ * Here for async case, we allocate an io_end structure to
+ * hook to the iocb.
*/
- iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb));
- if (!iocb->private)
- return -ENOMEM;
+ iocb->private = NULL;
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ if (!is_sync_kiocb(iocb)) {
+ iocb->private = ext4_init_io_end(inode);
+ if (!iocb->private)
+ return -ENOMEM;
+ /*
+ * we save the io structure for the current async
+ * direct IO, so that later ext4_get_blocks()
+ * can flag the io structure if there are
+ * unwritten extents that need to be converted
+ * when the IO is completed.
+ */
+ EXT4_I(inode)->cur_aio_dio = iocb->private;
+ }
+
ret = blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block_dio_write,
ext4_end_io_dio);
+ if (iocb->private)
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ /*
+ * The io_end structure takes a reference to the inode,
+ * that structure needs to be destroyed and the
+ * reference to the inode needs to be dropped when the IO is
+ * complete, even for a 0 byte write or a failed write.
+ *
+ * In the successful AIO DIO case, the io_end structure will be
+ * destroyed and the reference to the inode will be dropped
+ * after the end_io call back function is called.
+ *
+ * In the 0 byte write case, or the error case, since
+ * VFS direct IO won't invoke the end_io call back function,
+ * we need to free the end_io structure here.
+ */
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+ } else if (ret > 0)
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the conversion right here
+ */
+ ret = ext4_convert_unwritten_extents(inode,
+ offset, ret);
return ret;
}
+
+ /* for the write to the end of file case, we fall back to the old way */
return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
}

Index: linux-2.6.31-rc4/fs/ext4/ext4.h
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/ext4.h
+++ linux-2.6.31-rc4/fs/ext4/ext4.h
@@ -110,10 +110,11 @@ struct ext4_allocation_request {
/* flags. see above EXT4_MB_HINT_* */
unsigned int flags;
};
-
+#define DIO_AIO_UNWRITTEN 0x1
typedef struct ext4_io_end{
+ struct list_head list; /* per-file finished AIO list */
struct inode *inode; /* file being written to */
- unsigned int flag; /* sync IO or AIO */
+ unsigned int flag; /* unwritten or not */
int error; /* I/O error code */
ext4_lblk_t offset; /* offset in the file */
size_t size; /* size of the extent */
@@ -671,6 +672,11 @@ struct ext4_inode_info {
__u16 i_extra_isize;

spinlock_t i_block_reservation_lock;
+
+ /* completed async DIOs that might need unwritten extents handling */
+ struct list_head i_aio_dio_complete_list;
+ /* current io_end structure for async DIO write*/
+ ext4_io_end_t *cur_aio_dio;
};

/*
@@ -1399,7 +1405,7 @@ extern int ext4_block_truncate_page(hand
struct address_space *mapping, loff_t from);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t ext4_get_reserved_space(struct inode *inode);
-
+extern int flush_aio_dio_completed_IO(struct inode *inode);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
Index: linux-2.6.31-rc4/fs/ext4/super.c
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/super.c
+++ linux-2.6.31-rc4/fs/ext4/super.c
@@ -685,6 +685,8 @@ static struct inode *ext4_alloc_inode(st
ei->i_allocated_meta_blocks = 0;
ei->i_delalloc_reserved_flag = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
+ INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+ ei->cur_aio_dio = NULL;

return &ei->vfs_inode;
}
@@ -3344,11 +3346,13 @@ static int ext4_sync_fs(struct super_blo
{
int ret = 0;
tid_t target;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);

trace_ext4_sync_fs(sb, wait);
- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+ flush_workqueue(sbi->dio_unwritten_wq);
+ if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
+ jbd2_log_wait_commit(sbi->s_journal, target);
}
return ret;
}
Index: linux-2.6.31-rc4/fs/ext4/fsync.c
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/fsync.c
+++ linux-2.6.31-rc4/fs/ext4/fsync.c
@@ -44,6 +44,8 @@
*
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
+ *
+ * i_mutex lock is held on entry to and exit from this function
*/

int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, st

trace_ext4_sync_file(file, dentry, datasync);

+ ret = flush_aio_dio_completed_IO(inode);
+ if (ret < 0)
+ goto out;
/*
* data=writeback:
* The caller's filemap_fdatawrite()/wait will sync the data.
Index: linux-2.6.31-rc4/fs/ext4/extents.c
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/extents.c
+++ linux-2.6.31-rc4/fs/ext4/extents.c
@@ -3016,6 +3016,7 @@ ext4_ext_handle_uninitialized_extents(ha
{
int ret = 0;
int err = 0;
+ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;

ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
"block %llu, max_blocks %u, flags %d, allocated %u",
@@ -3028,6 +3029,9 @@ ext4_ext_handle_uninitialized_extents(ha
ret = ext4_split_unwritten_extents(handle,
inode, path, iblock,
max_blocks, flags);
+ /* flag the io_end struct that we need to convert when the IO is done */
+ if (io)
+ io->flag = DIO_AIO_UNWRITTEN;
goto out;
}
/* DIO end_io complete, convert the filled extent to written */
@@ -3113,6 +3117,7 @@ int ext4_ext_get_blocks(handle_t *handle
int err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
struct ext4_allocation_request ar;
+ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;

__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -3262,8 +3267,20 @@ int ext4_ext_get_blocks(handle_t *handle
/* try to insert new extent into found leaf and return */
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(ar.len);
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
+ /* Mark uninitialized */
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
ext4_ext_mark_uninitialized(&newex);
+ /*
+ * io_end structure was created for every async
+ * direct IO write to the middle of the file.
+ * To avoid unnecessary conversion for every aio dio rewrite
+ * to the middle of the file, here we flag the IO that really
+ * needs the conversion.
+ *
+ */
+ if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ io->flag = DIO_AIO_UNWRITTEN;
+ }
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
/* free data blocks we just allocated */