Version 2 Updated patch after fixing issues with fsstress tests.
Currently DIO VFS code passes create = 0 flag for write to the middle of
file. It does this to avoid block allocation for holes, to prevent
expose stale data out when there is parallel buffered read (which does
not hold the i_mutex lock) while direct IO write has not completed. DIO
request on holes finally falls back to buffered IO for this reason.
Since preallocated extents are treated as holes when do a get_block()
look up (buffer is not mapped), thus direct IO over fallocate also falls
back to buffered IO. Thus ext4 actually silently falls back to buffered
IO in above two cases.
The basic idea to fix the direct IO on fallocate issue is to map the
fallocated extents on direct IO write before submit the IO, but wait
until the direct IO complete then convert the unwritten extent to
written extent, using an end_io call back function. We will need to
split the unwritten extents before submit the IO to prevent later
ENOSPC, and when IO complete, just mark the written part initialized.
With the end_io call back function, now it's possible to do direct IO on
holes. For ext4 extent based file, since we support fallocate, we could
fallocate blocks for holes, mark it mapped but unwritten. This way
parallel buffered IO read returns zeros when parallel DIO write to the
hole has not completed. When direct IO write complete, using the same
end_io schemem to convert those fallocated hole to written extents.
For direct IO write to the end of file, we now also could get rid of
using orphan list to protect expose stale data out after crash, when
direct write to end of file isn't complete. We now fallocate blocks for
the direct IO write to the end of file as well, and convert those
fallocated blocks at the end of file to written when IO is complete. If
fs crashed before the IO complete, it will only seen the file tail has
been fallocated but won't get the stale data out.
1) Block allocation needed for DIO write are fallocated, including holes
and file tail write, marked as unwritten extents after block allocation.
2) those unwritten extents, and fallocate extents, will be converted to
written extents (and update disk size when write to end of file)when the
IO is complete. The conversion is triggered using end_io call back
function passing from ext4 fs to direct IO.
3) For already fallocated extent, at the time try to map the fallocated
extent, we will split the fallocated extent as necessary, mark the
to-write fallocated extent mapped but still remains unwritten,
insert the splitted extents, to prevent ENOSPC later.
This first patch does 1) and 2), the second patch does 3)
Patch against ext4 patch queue.
Comments?
Singed-Off-By: Mingming Cao <[email protected]>
---
fs/ext4/ext4.h | 18 ++++
fs/ext4/inode.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
fs/ext4/super.c | 11 ++
3 files changed, 237 insertions(+), 3 deletions(-)
Index: linux-2.6.31-rc4/fs/ext4/ext4.h
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/ext4.h 2009-08-09 14:46:10.000000000 -0700
+++ linux-2.6.31-rc4/fs/ext4/ext4.h 2009-08-09 23:13:15.000000000 -0700
@@ -111,6 +111,15 @@ struct ext4_allocation_request {
unsigned int flags;
};
+typedef struct ext4_io_end{
+ struct inode *inode; /* file being written to */
+ unsigned int type; /* unwritten or written */
+ int error; /* I/O error code */
+ ext4_lblk_t offset; /* offset in the file */
+ size_t size; /* size of the extent */
+ struct work_struct work; /* data work queue */
+}ext4_io_end_t;
+
/*
* Special inodes numbers
*/
@@ -330,8 +339,8 @@ struct ext4_new_group_data {
/* Call ext4_da_update_reserve_space() after successfully
allocating the blocks */
#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
-
-
+#define EXT4_GET_BLOCKS_DIO_CREATE_EXT 0x0011
+#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT 0x0021
/*
* ioctl commands
*/
@@ -960,6 +969,9 @@ struct ext4_sb_info {
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
+
+ /* workqueue for dio unwritten */
+ struct workqueue_struct *dio_unwritten_wq;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1650,6 +1662,8 @@ extern void ext4_ext_init(struct super_b
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t len);
+extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ loff_t len);
extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int flags);
Index: linux-2.6.31-rc4/fs/ext4/super.c
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/super.c 2009-08-09 14:51:08.000000000 -0700
+++ linux-2.6.31-rc4/fs/ext4/super.c 2009-08-09 14:51:14.000000000 -0700
@@ -578,6 +578,9 @@ static void ext4_put_super(struct super_
struct ext4_super_block *es = sbi->s_es;
int i, err;
+ flush_workqueue(sbi->dio_unwritten_wq);
+ destroy_workqueue(sbi->dio_unwritten_wq);
+
lock_super(sb);
lock_kernel();
if (sb->s_dirt)
@@ -2781,6 +2784,12 @@ no_journal:
clear_opt(sbi->s_mount_opt, NOBH);
}
}
+ EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
+ if (!EXT4_SB(sb)->dio_unwritten_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ goto failed_mount_wq;
+ }
+
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
@@ -2893,6 +2902,8 @@ cantfind_ext4:
failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
+ destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+failed_mount_wq:
ext4_release_system_zone(sb);
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
Index: linux-2.6.31-rc4/fs/ext4/inode.c
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/inode.c 2009-08-09 14:46:32.000000000 -0700
+++ linux-2.6.31-rc4/fs/ext4/inode.c 2009-08-09 14:56:40.000000000 -0700
@@ -37,6 +37,7 @@
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
+#include <linux/workqueue.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -3279,6 +3280,8 @@ static int ext4_releasepage(struct page
}
/*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
* if the machine crashes during the write.
@@ -3287,7 +3290,7 @@ static int ext4_releasepage(struct page
* crashes then stale disk data _may_ be exposed inside the file. But current
* VFS code falls back into buffered path in that case so we are safe.
*/
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
@@ -3361,6 +3364,212 @@ out:
return ret;
}
+struct workqueue_struct *ext4_unwritten_queue;
+
+/* Maximum number of blocks we map for direct IO at once. */
+
+static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ handle_t *handle = NULL;
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int dio_credits;
+
+ /*
+ * DIO VFS code passes create = 0 flag for write to
+ * the middle of file. It does this to avoid block
+ * allocation for holes, to prevent expose stale data
+ * out when there is parallel buffered read (which does
+ * not hold the i_mutex lock) while direct IO write has
+ * not completed. DIO request on holes finally falls back
+ * to buffered IO for this reason.
+ *
+ * For ext4 extent based file, since we support fallocate,
+ * new allocated extent as uninitialized, for holes, we
+ * could fallocate blocks for holes, thus parallel
+ * buffered IO read will zero out the page when read on
+ * a hole while parallel DIO write to the hole has not completed.
+ *
+ * when we come here, we know it's a direct IO write,
+ * so it's safe to override the create flag from VFS.
+ */
+ create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+
+ if (max_blocks > DIO_MAX_BLOCKS)
+ max_blocks = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ handle = ext4_journal_start(inode, dio_credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+ create);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ ext4_journal_stop(handle);
+out:
+ return ret;
+}
+
+static int ext4_get_block_dio_read(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ handle_t *handle = NULL;
+
+ ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+ create);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ return ret;
+}
+
+
+#define DIO_UNWRITTEN 0x1
+
+/*
+ * IO write completion for unwritten extents.
+ *
+ * check a range of space and convert unwritten extents to written.
+ */
+static void ext4_end_dio_unwritten(struct work_struct *work)
+{
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ struct inode *inode = io->inode;
+ loff_t offset = io->offset;
+ size_t size = io->size;
+ int ret = 0;
+
+ ret = ext4_convert_unwritten_extents(inode, offset, size);
+
+ if (ret < 0)
+ printk(KERN_EMERG "%s: failed to convert unwritten"
+ "extents to written extents, error is %d\n",
+ __func__, ret);
+ kfree(io);
+}
+
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int type)
+{
+ ext4_io_end_t *io = NULL;
+
+ io = kmalloc(sizeof(*io), GFP_NOFS);
+
+ if (io) {
+ io->inode = inode;
+ io->type = type;
+ io->offset = 0;
+ io->size = 0;
+ io->error = 0;
+ INIT_WORK(&io->work, ext4_end_dio_unwritten);
+ }
+
+ return io;
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private)
+{
+ ext4_io_end_t *io_end = iocb->private;
+ struct workqueue_struct *wq;
+
+ /* if not hole or unwritten extents, just simple return */
+ if (!io_end || !size)
+ return;
+ io_end->offset = offset;
+ io_end->size = size;
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+ /* We need to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+
+ if (is_sync_kiocb(iocb))
+ flush_workqueue(wq);
+
+ iocb->private = NULL;
+}
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * If there is block allocation needed(holes, EOF write), we fallocate
+ * those blocks, mark them as unintialized
+ * If those blocks were preallocated, we mark sure they are splited, but
+ * still keep the range to write as unintialized.
+ *
+ * When end_io call back function called at the last IO complete time,
+ * those extents will be converted to written extents.
+ *
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+
+ if (rw == WRITE) {
+ /*
+ * For DIO we fallocate blocks for holes and end of file
+ * write. Those fallocated extents are marked as uninitialized
+ * to prevent paralel buffered read to expose the stale data
+ * before DIO complete the data IO.
+ * as for previously fallocated extents, ext4 get_block
+ * will just simply mark the buffer mapped but still
+ * keep the extents uninitialized.
+ *
+ * At the end of IO, the ext4 end_io callback function
+ * will convert those unwritten extents to written,
+ * and update on disk file size if the DIO expands the file.
+ *
+ */
+ iocb->private = ext4_init_io_end(inode, DIO_UNWRITTEN);
+ if (!iocb->private)
+ return -ENOMEM;
+
+ ret = blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_dio_write,
+ ext4_end_io_dio);
+ } else
+ ret = blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_dio_read, NULL);
+
+ /*
+ * In the case of AIO DIO, VFS dio submitted the IO, but it
+ * does not wait for io complete. To prevent expose stale
+ * data after crash before IO complete,
+ * i_disksize needs to be updated at the
+ * time all the IO is completed, not here
+ */
+ return ret;
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
/*
* Pages can be marked dirty completely asynchronously from ext4's journalling
* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
Hi Mingming,
> Version 2 Updated patch after fixing issues with fsstress tests.
Probably it would make sence to reorder patches so that first comes
extent rewriting code and then in the second patch you use it from DIO
path. This way you call a function which is not there which is strange
and it can break bisecting the kernel for no good reason.
> Currently DIO VFS code passes create = 0 flag for write to the middle of
> file. It does this to avoid block allocation for holes, to prevent
> expose stale data out when there is parallel buffered read (which does
> not hold the i_mutex lock) while direct IO write has not completed. DIO
> request on holes finally falls back to buffered IO for this reason.
>
> Since preallocated extents are treated as holes when do a get_block()
> look up (buffer is not mapped), thus direct IO over fallocate also falls
> back to buffered IO. Thus ext4 actually silently falls back to buffered
> IO in above two cases.
>
> The basic idea to fix the direct IO on fallocate issue is to map the
> fallocated extents on direct IO write before submit the IO, but wait
> until the direct IO complete then convert the unwritten extent to
> written extent, using an end_io call back function. We will need to
> split the unwritten extents before submit the IO to prevent later
> ENOSPC, and when IO complete, just mark the written part initialized.
>
> With the end_io call back function, now it's possible to do direct IO on
> holes. For ext4 extent based file, since we support fallocate, we could
> fallocate blocks for holes, mark it mapped but unwritten. This way
> parallel buffered IO read returns zeros when parallel DIO write to the
> hole has not completed. When direct IO write complete, using the same
> end_io schemem to convert those fallocated hole to written extents.
Admittedly, I don't think this is the best solution... It can be done
but there are quite some corner cases to watch. See my comments in the
patch below.
> For direct IO write to the end of file, we now also could get rid of
> using orphan list to protect expose stale data out after crash, when
> direct write to end of file isn't complete. We now fallocate blocks for
> the direct IO write to the end of file as well, and convert those
> fallocated blocks at the end of file to written when IO is complete. If
> fs crashed before the IO complete, it will only seen the file tail has
> been fallocated but won't get the stale data out.
But you still probably need orphan list to truncate blocks allocated
beyond file end during extending direct write. So I'd just remove this
paragraph...
> 1) Block allocation needed for DIO write are fallocated, including holes
> and file tail write, marked as unwritten extents after block allocation.
>
> 2) those unwritten extents, and fallocate extents, will be converted to
> written extents (and update disk size when write to end of file)when the
> IO is complete. The conversion is triggered using end_io call back
> function passing from ext4 fs to direct IO.
>
> 3) For already fallocated extent, at the time try to map the fallocated
> extent, we will split the fallocated extent as necessary, mark the
> to-write fallocated extent mapped but still remains unwritten,
> insert the splitted extents, to prevent ENOSPC later.
>
> This first patch does 1) and 2), the second patch does 3)
>
> Patch against ext4 patch queue.
>
> Comments?
>
> Singed-Off-By: Mingming Cao <[email protected]>
> ---
> fs/ext4/ext4.h | 18 ++++
> fs/ext4/inode.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> fs/ext4/super.c | 11 ++
> 3 files changed, 237 insertions(+), 3 deletions(-)
>
> Index: linux-2.6.31-rc4/fs/ext4/ext4.h
> ===================================================================
> --- linux-2.6.31-rc4.orig/fs/ext4/ext4.h 2009-08-09 14:46:10.000000000 -0700
> +++ linux-2.6.31-rc4/fs/ext4/ext4.h 2009-08-09 23:13:15.000000000 -0700
> @@ -111,6 +111,15 @@ struct ext4_allocation_request {
> unsigned int flags;
> };
>
> +typedef struct ext4_io_end{
> + struct inode *inode; /* file being written to */
> + unsigned int type; /* unwritten or written */
> + int error; /* I/O error code */
> + ext4_lblk_t offset; /* offset in the file */
> + size_t size; /* size of the extent */
> + struct work_struct work; /* data work queue */
> +}ext4_io_end_t;
> +
> /*
> * Special inodes numbers
> */
> @@ -330,8 +339,8 @@ struct ext4_new_group_data {
> /* Call ext4_da_update_reserve_space() after successfully
> allocating the blocks */
> #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
> -
> -
> +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT 0x0011
> +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT 0x0021
> /*
> * ioctl commands
> */
> @@ -960,6 +969,9 @@ struct ext4_sb_info {
>
> unsigned int s_log_groups_per_flex;
> struct flex_groups *s_flex_groups;
> +
> + /* workqueue for dio unwritten */
> + struct workqueue_struct *dio_unwritten_wq;
> };
>
> static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> @@ -1650,6 +1662,8 @@ extern void ext4_ext_init(struct super_b
> extern void ext4_ext_release(struct super_block *);
> extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
> loff_t len);
> +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
> + loff_t len);
> extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
> sector_t block, unsigned int max_blocks,
> struct buffer_head *bh, int flags);
> Index: linux-2.6.31-rc4/fs/ext4/super.c
> ===================================================================
> --- linux-2.6.31-rc4.orig/fs/ext4/super.c 2009-08-09 14:51:08.000000000 -0700
> +++ linux-2.6.31-rc4/fs/ext4/super.c 2009-08-09 14:51:14.000000000 -0700
> @@ -578,6 +578,9 @@ static void ext4_put_super(struct super_
> struct ext4_super_block *es = sbi->s_es;
> int i, err;
>
> + flush_workqueue(sbi->dio_unwritten_wq);
> + destroy_workqueue(sbi->dio_unwritten_wq);
> +
> lock_super(sb);
> lock_kernel();
> if (sb->s_dirt)
> @@ -2781,6 +2784,12 @@ no_journal:
> clear_opt(sbi->s_mount_opt, NOBH);
> }
> }
> + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
> + if (!EXT4_SB(sb)->dio_unwritten_wq) {
> + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
> + goto failed_mount_wq;
> + }
> +
> /*
> * The jbd2_journal_load will have done any necessary log recovery,
> * so we can safely mount the rest of the filesystem now.
> @@ -2893,6 +2902,8 @@ cantfind_ext4:
>
> failed_mount4:
> ext4_msg(sb, KERN_ERR, "mount failed");
> + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
> +failed_mount_wq:
> ext4_release_system_zone(sb);
> if (sbi->s_journal) {
> jbd2_journal_destroy(sbi->s_journal);
> Index: linux-2.6.31-rc4/fs/ext4/inode.c
> ===================================================================
> --- linux-2.6.31-rc4.orig/fs/ext4/inode.c 2009-08-09 14:46:32.000000000 -0700
> +++ linux-2.6.31-rc4/fs/ext4/inode.c 2009-08-09 14:56:40.000000000 -0700
> @@ -37,6 +37,7 @@
> #include <linux/namei.h>
> #include <linux/uio.h>
> #include <linux/bio.h>
> +#include <linux/workqueue.h>
>
> #include "ext4_jbd2.h"
> #include "xattr.h"
> @@ -3279,6 +3280,8 @@ static int ext4_releasepage(struct page
> }
>
> /*
> + * O_DIRECT for ext3 (or indirect map) based files
> + *
> * If the O_DIRECT write will extend the file then add this inode to the
> * orphan list. So recovery will truncate it back to the original size
> * if the machine crashes during the write.
> @@ -3287,7 +3290,7 @@ static int ext4_releasepage(struct page
> * crashes then stale disk data _may_ be exposed inside the file. But current
> * VFS code falls back into buffered path in that case so we are safe.
> */
> -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
> +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
> const struct iovec *iov, loff_t offset,
> unsigned long nr_segs)
> {
> @@ -3361,6 +3364,212 @@ out:
> return ret;
> }
>
> +struct workqueue_struct *ext4_unwritten_queue;
> +
> +/* Maximum number of blocks we map for direct IO at once. */
> +
> +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
> + struct buffer_head *bh_result, int create)
> +{
> + handle_t *handle = NULL;
> + int ret = 0;
> + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
> + int dio_credits;
> +
> + /*
> + * DIO VFS code passes create = 0 flag for write to
> + * the middle of file. It does this to avoid block
> + * allocation for holes, to prevent expose stale data
> + * out when there is parallel buffered read (which does
> + * not hold the i_mutex lock) while direct IO write has
> + * not completed. DIO request on holes finally falls back
> + * to buffered IO for this reason.
> + *
> + * For ext4 extent based file, since we support fallocate,
> + * new allocated extent as uninitialized, for holes, we
> + * could fallocate blocks for holes, thus parallel
> + * buffered IO read will zero out the page when read on
> + * a hole while parallel DIO write to the hole has not completed.
> + *
> + * when we come here, we know it's a direct IO write,
> + * so it's safe to override the create flag from VFS.
> + */
> + create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
> +
> + if (max_blocks > DIO_MAX_BLOCKS)
> + max_blocks = DIO_MAX_BLOCKS;
> + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
> + handle = ext4_journal_start(inode, dio_credits);
> + if (IS_ERR(handle)) {
> + ret = PTR_ERR(handle);
> + goto out;
> + }
> + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
> + create);
> + if (ret > 0) {
> + bh_result->b_size = (ret << inode->i_blkbits);
> + ret = 0;
> + }
> + ext4_journal_stop(handle);
> +out:
> + return ret;
> +}
> +
> +static int ext4_get_block_dio_read(struct inode *inode, sector_t iblock,
> + struct buffer_head *bh_result, int create)
> +{
> + int ret = 0;
> + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
> + handle_t *handle = NULL;
> +
> + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
> + create);
> + if (ret > 0) {
> + bh_result->b_size = (ret << inode->i_blkbits);
> + ret = 0;
> + }
> + return ret;
> +}
Huh, what's the purpose of the above function? We can use normal
get_block, can't we?
> +
> +
> +#define DIO_UNWRITTEN 0x1
> +
> +/*
> + * IO write completion for unwritten extents.
> + *
> + * check a range of space and convert unwritten extents to written.
> + */
> +static void ext4_end_dio_unwritten(struct work_struct *work)
> +{
> + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> + struct inode *inode = io->inode;
> + loff_t offset = io->offset;
> + size_t size = io->size;
> + int ret = 0;
> +
> + ret = ext4_convert_unwritten_extents(inode, offset, size);
> +
> + if (ret < 0)
> + printk(KERN_EMERG "%s: failed to convert unwritten"
> + "extents to written extents, error is %d\n",
> + __func__, ret);
> + kfree(io);
> +}
Looking at ext4_convert_unwritten_extents(), you definitely miss some
locking. Since this is called completely asynchronously, you have to
protect against racing truncates and basically anything can happen with
the inode in the mean time. It needn't be cached in the memory anymore!
Also fsync() has to flush all the updates for the inode it has in the
workqueue... Ditto for ext4_sync_fs().
> +
> +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int type)
> +{
> + ext4_io_end_t *io = NULL;
> +
> + io = kmalloc(sizeof(*io), GFP_NOFS);
> +
> + if (io) {
> + io->inode = inode;
> + io->type = type;
> + io->offset = 0;
> + io->size = 0;
> + io->error = 0;
> + INIT_WORK(&io->work, ext4_end_dio_unwritten);
> + }
> +
> + return io;
> +}
> +
> +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> + ssize_t size, void *private)
> +{
> + ext4_io_end_t *io_end = iocb->private;
> + struct workqueue_struct *wq;
> +
> + /* if not hole or unwritten extents, just simple return */
> + if (!io_end || !size)
> + return;
> + io_end->offset = offset;
> + io_end->size = size;
> + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> +
> + /* We need to convert unwritten extents to written */
> + queue_work(wq, &io_end->work);
> +
> + if (is_sync_kiocb(iocb))
> + flush_workqueue(wq);
I don't think you can flush_workqueue here. end_io is called from
interrupt context and flush_workqueue blocks for a long time...
The wait should be done in ext4_direct_IO IMHO...
> +
> + iocb->private = NULL;
> +}
> +/*
> + * For ext4 extent files, ext4 will do direct-io write to holes,
> + * preallocated extents, and those write extend the file, no need to
> + * fall back to buffered IO.
> + *
> + * If there is block allocation needed(holes, EOF write), we fallocate
> + * those blocks, mark them as unintialized
> + * If those blocks were preallocated, we mark sure they are splited, but
> + * still keep the range to write as unintialized.
> + *
> + * When end_io call back function called at the last IO complete time,
> + * those extents will be converted to written extents.
> + *
> + */
> +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> + const struct iovec *iov, loff_t offset,
> + unsigned long nr_segs)
> +{
> + struct file *file = iocb->ki_filp;
> + struct inode *inode = file->f_mapping->host;
> + ssize_t ret;
> +
> + if (rw == WRITE) {
> + /*
> + * For DIO we fallocate blocks for holes and end of file
> + * write. Those fallocated extents are marked as uninitialized
> + * to prevent paralel buffered read to expose the stale data
> + * before DIO complete the data IO.
> + * as for previously fallocated extents, ext4 get_block
> + * will just simply mark the buffer mapped but still
> + * keep the extents uninitialized.
> + *
> + * At the end of IO, the ext4 end_io callback function
> + * will convert those unwritten extents to written,
> + * and update on disk file size if the DIO expands the file.
> + *
> + */
> + iocb->private = ext4_init_io_end(inode, DIO_UNWRITTEN);
> + if (!iocb->private)
> + return -ENOMEM;
> +
> + ret = blockdev_direct_IO(rw, iocb, inode,
> + inode->i_sb->s_bdev, iov,
> + offset, nr_segs,
> + ext4_get_block_dio_write,
> + ext4_end_io_dio);
> + } else
> + ret = blockdev_direct_IO(rw, iocb, inode,
> + inode->i_sb->s_bdev, iov,
> + offset, nr_segs,
> + ext4_get_block_dio_read, NULL);
> +
> + /*
> + * In the case of AIO DIO, VFS dio submitted the IO, but it
> + * does not wait for io complete. To prevent expose stale
> + * data after crash before IO complete,
> + * i_disksize needs to be updated at the
> + * time all the IO is completed, not here
> + */
Yeah, but so far at least ext3_direct_IO and ext4_ind_direct_IO
happily update i_size here and noone reported the problem (yet) ;). The
thing is that the current transaction has to commit for stale data to be
visible and that sends a barrier which also forces blocks written by
direct IO to the persistent storage. So at least if the underlying
storage supports barriers, we are fine. If it does not, it could maybe
reorder direct IO writes after the journal commit (it need not have
reported the direct IO as done so far) and after a crash we would have
a problem...
It's just that I'm not sure that all the trouble with end_io and
workqueues is worth it... More code = more bugs ;)
If we decided that we care enough to be really sure that we cannot
expose dirty data in AIO DIO, we could just do some trick like have a
list of running dios attached to the inode (protected by some spinlock)
and wait for them during the transaction commit (we'd have to add a
commit trigger for an inode but that's trivial).
> + return ret;
> +}
> +
> +static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
> + const struct iovec *iov, loff_t offset,
> + unsigned long nr_segs)
> +{
> + struct file *file = iocb->ki_filp;
> + struct inode *inode = file->f_mapping->host;
> +
> + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
> + return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
> +
> + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
> +}
> +
> /*
> * Pages can be marked dirty completely asynchronously from ext4's journalling
> * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
Honza
--
Jan Kara <[email protected]>
SuSE CR Labs
On Wed, 2009-08-19 at 16:15 +0200, Jan Kara wrote:
> Hi Mingming,
>
Hello Jan,
Thanks for spending time review this patch.
> > Version 2 Updated patch after fixing issues with fsstress tests.
> Probably it would make sence to reorder patches so that first comes
> extent rewriting code and then in the second patch you use it from DIO
> path. This way you call a function which is not there which is strange
> and it can break bisecting the kernel for no good reason.
>
Good idea, I will re-order the patch.
> > Currently DIO VFS code passes create = 0 flag for write to the middle of
> > file. It does this to avoid block allocation for holes, to prevent
> > expose stale data out when there is parallel buffered read (which does
> > not hold the i_mutex lock) while direct IO write has not completed. DIO
> > request on holes finally falls back to buffered IO for this reason.
> >
> > Since preallocated extents are treated as holes when do a get_block()
> > look up (buffer is not mapped), thus direct IO over fallocate also falls
> > back to buffered IO. Thus ext4 actually silently falls back to buffered
> > IO in above two cases.
> >
> > The basic idea to fix the direct IO on fallocate issue is to map the
> > fallocated extents on direct IO write before submit the IO, but wait
> > until the direct IO complete then convert the unwritten extent to
> > written extent, using an end_io call back function. We will need to
> > split the unwritten extents before submit the IO to prevent later
> > ENOSPC, and when IO complete, just mark the written part initialized.
> >
> > With the end_io call back function, now it's possible to do direct IO on
> > holes. For ext4 extent based file, since we support fallocate, we could
> > fallocate blocks for holes, mark it mapped but unwritten. This way
> > parallel buffered IO read returns zeros when parallel DIO write to the
> > hole has not completed. When direct IO write complete, using the same
> > end_io schemem to convert those fallocated hole to written extents.
> Admittedly, I don't think this is the best solution... It can be done
> but there are quite some corner cases to watch. See my comments in the
> patch below.
>
Okay, are you worried about using unwritten extent and end_io call back
function for direct write to holes? Or to preallocated space? I thought
it's pretty straighforward for these two cases...
> > For direct IO write to the end of file, we now also could get rid of
> > using orphan list to protect expose stale data out after crash, when
> > direct write to end of file isn't complete. We now fallocate blocks for
> > the direct IO write to the end of file as well, and convert those
> > fallocated blocks at the end of file to written when IO is complete. If
> > fs crashed before the IO complete, it will only seen the file tail has
> > been fallocated but won't get the stale data out.
> But you still probably need orphan list to truncate blocks allocated
> beyond file end during extending direct write. So I'd just remove this
> paragraph...
>
Do we still need to truncate blocks allocated at the end of file? Those
blocks are marked as uninitialized extents (as any block allocation from
DIO are flagged as uninitialized extents, before IO is complete), so
that after recover from crash, the stale data won't get exposed.
I guess I missed the cases you concerned that we need the orphan list to
protect, could you plain a little more?
> > 1) Block allocation needed for DIO write are fallocated, including holes
> > and file tail write, marked as unwritten extents after block allocation.
> >
> > 2) those unwritten extents, and fallocate extents, will be converted to
> > written extents (and update disk size when write to end of file)when the
> > IO is complete. The conversion is triggered using end_io call back
> > function passing from ext4 fs to direct IO.
> >
> > 3) For already fallocated extent, at the time try to map the fallocated
> > extent, we will split the fallocated extent as necessary, mark the
> > to-write fallocated extent mapped but still remains unwritten,
> > insert the splitted extents, to prevent ENOSPC later.
> >
> > This first patch does 1) and 2), the second patch does 3)
> >
> > Patch against ext4 patch queue.
> >
> > Comments?
> >
> > Singed-Off-By: Mingming Cao <[email protected]>
> > ---
> > fs/ext4/ext4.h | 18 ++++
> > fs/ext4/inode.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > fs/ext4/super.c | 11 ++
> > 3 files changed, 237 insertions(+), 3 deletions(-)
> >
> > Index: linux-2.6.31-rc4/fs/ext4/ext4.h
> > ===================================================================
> > --- linux-2.6.31-rc4.orig/fs/ext4/ext4.h 2009-08-09 14:46:10.000000000 -0700
> > +++ linux-2.6.31-rc4/fs/ext4/ext4.h 2009-08-09 23:13:15.000000000 -0700
> > @@ -111,6 +111,15 @@ struct ext4_allocation_request {
> > unsigned int flags;
> > };
> >
> > +typedef struct ext4_io_end{
> > + struct inode *inode; /* file being written to */
> > + unsigned int type; /* unwritten or written */
> > + int error; /* I/O error code */
> > + ext4_lblk_t offset; /* offset in the file */
> > + size_t size; /* size of the extent */
> > + struct work_struct work; /* data work queue */
> > +}ext4_io_end_t;
> > +
> > /*
> > * Special inodes numbers
> > */
> > @@ -330,8 +339,8 @@ struct ext4_new_group_data {
> > /* Call ext4_da_update_reserve_space() after successfully
> > allocating the blocks */
> > #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
> > -
> > -
> > +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT 0x0011
> > +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT 0x0021
> > /*
> > * ioctl commands
> > */
> > @@ -960,6 +969,9 @@ struct ext4_sb_info {
> >
> > unsigned int s_log_groups_per_flex;
> > struct flex_groups *s_flex_groups;
> > +
> > + /* workqueue for dio unwritten */
> > + struct workqueue_struct *dio_unwritten_wq;
> > };
> >
> > static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> > @@ -1650,6 +1662,8 @@ extern void ext4_ext_init(struct super_b
> > extern void ext4_ext_release(struct super_block *);
> > extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
> > loff_t len);
> > +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
> > + loff_t len);
> > extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
> > sector_t block, unsigned int max_blocks,
> > struct buffer_head *bh, int flags);
> > Index: linux-2.6.31-rc4/fs/ext4/super.c
> > ===================================================================
> > --- linux-2.6.31-rc4.orig/fs/ext4/super.c 2009-08-09 14:51:08.000000000 -0700
> > +++ linux-2.6.31-rc4/fs/ext4/super.c 2009-08-09 14:51:14.000000000 -0700
> > @@ -578,6 +578,9 @@ static void ext4_put_super(struct super_
> > struct ext4_super_block *es = sbi->s_es;
> > int i, err;
> >
> > + flush_workqueue(sbi->dio_unwritten_wq);
> > + destroy_workqueue(sbi->dio_unwritten_wq);
> > +
> > lock_super(sb);
> > lock_kernel();
> > if (sb->s_dirt)
> > @@ -2781,6 +2784,12 @@ no_journal:
> > clear_opt(sbi->s_mount_opt, NOBH);
> > }
> > }
> > + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
> > + if (!EXT4_SB(sb)->dio_unwritten_wq) {
> > + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
> > + goto failed_mount_wq;
> > + }
> > +
> > /*
> > * The jbd2_journal_load will have done any necessary log recovery,
> > * so we can safely mount the rest of the filesystem now.
> > @@ -2893,6 +2902,8 @@ cantfind_ext4:
> >
> > failed_mount4:
> > ext4_msg(sb, KERN_ERR, "mount failed");
> > + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
> > +failed_mount_wq:
> > ext4_release_system_zone(sb);
> > if (sbi->s_journal) {
> > jbd2_journal_destroy(sbi->s_journal);
> > Index: linux-2.6.31-rc4/fs/ext4/inode.c
> > ===================================================================
> > --- linux-2.6.31-rc4.orig/fs/ext4/inode.c 2009-08-09 14:46:32.000000000 -0700
> > +++ linux-2.6.31-rc4/fs/ext4/inode.c 2009-08-09 14:56:40.000000000 -0700
> > @@ -37,6 +37,7 @@
> > #include <linux/namei.h>
> > #include <linux/uio.h>
> > #include <linux/bio.h>
> > +#include <linux/workqueue.h>
> >
> > #include "ext4_jbd2.h"
> > #include "xattr.h"
> > @@ -3279,6 +3280,8 @@ static int ext4_releasepage(struct page
> > }
> >
> > /*
> > + * O_DIRECT for ext3 (or indirect map) based files
> > + *
> > * If the O_DIRECT write will extend the file then add this inode to the
> > * orphan list. So recovery will truncate it back to the original size
> > * if the machine crashes during the write.
> > @@ -3287,7 +3290,7 @@ static int ext4_releasepage(struct page
> > * crashes then stale disk data _may_ be exposed inside the file. But current
> > * VFS code falls back into buffered path in that case so we are safe.
> > */
> > -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
> > +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
> > const struct iovec *iov, loff_t offset,
> > unsigned long nr_segs)
> > {
> > @@ -3361,6 +3364,212 @@ out:
> > return ret;
> > }
> >
> > +struct workqueue_struct *ext4_unwritten_queue;
> > +
> > +/* Maximum number of blocks we map for direct IO at once. */
> > +
> > +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
> > + struct buffer_head *bh_result, int create)
> > +{
> > + handle_t *handle = NULL;
> > + int ret = 0;
> > + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
> > + int dio_credits;
> > +
> > + /*
> > + * DIO VFS code passes create = 0 flag for write to
> > + * the middle of file. It does this to avoid block
> > + * allocation for holes, to prevent expose stale data
> > + * out when there is parallel buffered read (which does
> > + * not hold the i_mutex lock) while direct IO write has
> > + * not completed. DIO request on holes finally falls back
> > + * to buffered IO for this reason.
> > + *
> > + * For ext4 extent based file, since we support fallocate,
> > + * new allocated extent as uninitialized, for holes, we
> > + * could fallocate blocks for holes, thus parallel
> > + * buffered IO read will zero out the page when read on
> > + * a hole while parallel DIO write to the hole has not completed.
> > + *
> > + * when we come here, we know it's a direct IO write,
> > + * so it's safe to override the create flag from VFS.
> > + */
> > + create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
> > +
> > + if (max_blocks > DIO_MAX_BLOCKS)
> > + max_blocks = DIO_MAX_BLOCKS;
> > + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
> > + handle = ext4_journal_start(inode, dio_credits);
> > + if (IS_ERR(handle)) {
> > + ret = PTR_ERR(handle);
> > + goto out;
> > + }
> > + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
> > + create);
> > + if (ret > 0) {
> > + bh_result->b_size = (ret << inode->i_blkbits);
> > + ret = 0;
> > + }
> > + ext4_journal_stop(handle);
> > +out:
> > + return ret;
> > +}
> > +
> > +static int ext4_get_block_dio_read(struct inode *inode, sector_t iblock,
> > + struct buffer_head *bh_result, int create)
> > +{
> > + int ret = 0;
> > + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
> > + handle_t *handle = NULL;
> > +
> > + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
> > + create);
> > + if (ret > 0) {
> > + bh_result->b_size = (ret << inode->i_blkbits);
> > + ret = 0;
> > + }
> > + return ret;
> > +}
> Huh, what's the purpose of the above function? We can use normal
> get_block, can't we?
>
This is pretty much a wrapper of this normal get_block function, except
here we need to store the space mapped in b_size, DIO will check that.
> > +
> > +
> > +#define DIO_UNWRITTEN 0x1
> > +
> > +/*
> > + * IO write completion for unwritten extents.
> > + *
> > + * check a range of space and convert unwritten extents to written.
> > + */
> > +static void ext4_end_dio_unwritten(struct work_struct *work)
> > +{
> > + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> > + struct inode *inode = io->inode;
> > + loff_t offset = io->offset;
> > + size_t size = io->size;
> > + int ret = 0;
> > +
> > + ret = ext4_convert_unwritten_extents(inode, offset, size);
> > +
> > + if (ret < 0)
> > + printk(KERN_EMERG "%s: failed to convert unwritten"
> > + "extents to written extents, error is %d\n",
> > + __func__, ret);
> > + kfree(io);
> > +}
> Looking at ext4_convert_unwritten_extents(), you definitely miss some
> locking. Since this is called completely asynchronously, you have to
> protect against racing truncates and basically anything can happen with
> the inode in the mean time.
extents tree update is protected by the i_data_sem which will be hold at
the ext4_get_blocks() called from ext4_convert_unwritten_extents.
perhaps should grab the i_mutex() which protects the inode update?
> It needn't be cached in the memory anymore!
Right, we probably need to increase the reference to the inode before
submit the IO, so the inode would not be push out of cache before IO
completed.
> Also fsync() has to flush all the updates for the inode it has in the
> workqueue... Ditto for ext4_sync_fs().
>
I think we discussed about this before, and it seems there is no clear
defination the DIO forces metadata update sync to disk before returns
back to user apps... If we do force this in ext4 &DIO, doing fsync() on
every DIO call is expensive.
If file is opened with sync node, then we need to flush all updates for
inode it has done with DIO. This is what currently ext3 dio does, inode
update (mtime etc).
> > +
> > +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int type)
> > +{
> > + ext4_io_end_t *io = NULL;
> > +
> > + io = kmalloc(sizeof(*io), GFP_NOFS);
> > +
> > + if (io) {
> > + io->inode = inode;
> > + io->type = type;
> > + io->offset = 0;
> > + io->size = 0;
> > + io->error = 0;
> > + INIT_WORK(&io->work, ext4_end_dio_unwritten);
> > + }
> > +
> > + return io;
> > +}
> > +
> > +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> > + ssize_t size, void *private)
> > +{
> > + ext4_io_end_t *io_end = iocb->private;
> > + struct workqueue_struct *wq;
> > +
> > + /* if not hole or unwritten extents, just simple return */
> > + if (!io_end || !size)
> > + return;
> > + io_end->offset = offset;
> > + io_end->size = size;
> > + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> > +
> > + /* We need to convert unwritten extents to written */
> > + queue_work(wq, &io_end->work);
> > +
> > + if (is_sync_kiocb(iocb))
> > + flush_workqueue(wq);
> I don't think you can flush_workqueue here. end_io is called from
> interrupt context and flush_workqueue blocks for a long time...
> The wait should be done in ext4_direct_IO IMHO...
>
Okay, I will move it to ext4_direct_IO(), hmm..I think that is fine with
AIO path as the is_sync_kiocb(iocb) will avoid that.
If flush workqueue just kick off the work on the workqueue but not wait
for it to complete (no fsync()), would that still be a big concern? BTW,
the flush workqueue only called when file is opened with sync mode.
> > +
> > + iocb->private = NULL;
> > +}
> > +/*
> > + * For ext4 extent files, ext4 will do direct-io write to holes,
> > + * preallocated extents, and those write extend the file, no need to
> > + * fall back to buffered IO.
> > + *
> > + * If there is block allocation needed(holes, EOF write), we fallocate
> > + * those blocks, mark them as unintialized
> > + * If those blocks were preallocated, we mark sure they are splited, but
> > + * still keep the range to write as unintialized.
> > + *
> > + * When end_io call back function called at the last IO complete time,
> > + * those extents will be converted to written extents.
> > + *
> > + */
> > +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> > + const struct iovec *iov, loff_t offset,
> > + unsigned long nr_segs)
> > +{
> > + struct file *file = iocb->ki_filp;
> > + struct inode *inode = file->f_mapping->host;
> > + ssize_t ret;
> > +
> > + if (rw == WRITE) {
> > + /*
> > + * For DIO we fallocate blocks for holes and end of file
> > + * write. Those fallocated extents are marked as uninitialized
> > + * to prevent paralel buffered read to expose the stale data
> > + * before DIO complete the data IO.
> > + * as for previously fallocated extents, ext4 get_block
> > + * will just simply mark the buffer mapped but still
> > + * keep the extents uninitialized.
> > + *
> > + * At the end of IO, the ext4 end_io callback function
> > + * will convert those unwritten extents to written,
> > + * and update on disk file size if the DIO expands the file.
> > + *
> > + */
> > + iocb->private = ext4_init_io_end(inode, DIO_UNWRITTEN);
> > + if (!iocb->private)
> > + return -ENOMEM;
> > +
> > + ret = blockdev_direct_IO(rw, iocb, inode,
> > + inode->i_sb->s_bdev, iov,
> > + offset, nr_segs,
> > + ext4_get_block_dio_write,
> > + ext4_end_io_dio);
> > + } else
> > + ret = blockdev_direct_IO(rw, iocb, inode,
> > + inode->i_sb->s_bdev, iov,
> > + offset, nr_segs,
> > + ext4_get_block_dio_read, NULL);
> > +
> > + /*
> > + * In the case of AIO DIO, VFS dio submitted the IO, but it
> > + * does not wait for io complete. To prevent expose stale
> > + * data after crash before IO complete,
> > + * i_disksize needs to be updated at the
> > + * time all the IO is completed, not here
> > + */
> Yeah, but so far at least ext3_direct_IO and ext4_ind_direct_IO
> happily update i_size here and noone reported the problem (yet) ;). The
> thing is that the current transaction has to commit for stale data to be
> visible and that sends a barrier which also forces blocks written by
> direct IO to the persistent storage. So at least if the underlying
> storage supports barriers, we are fine. If it does not, it could maybe
> reorder direct IO writes after the journal commit (it need not have
> reported the direct IO as done so far) and after a crash we would have
> a problem...
> It's just that I'm not sure that all the trouble with end_io and
> workqueues is worth it... More code = more bugs ;)
the end_io is there for direct write to fallocate. I am completely sure
if there is better way? We need to convert the extents to written at the
end of IO is completed, end_io call back seems works for this purpose.
The workqueue is for AIO case,it would be nice if we could do without it
for AIO. But read comments from xfs code it seems this is needed if we
use end_io call back.
> If we decided that we care enough to be really sure that we cannot
> expose dirty data in AIO DIO, we could just do some trick like have a
> list of running dios attached to the inode (protected by some spinlock)
> and wait for them during the transaction commit (we'd have to add a
> commit trigger for an inode but that's trivial).
>
I'll think about this...
Again, thanks for your input!
Mingming
> > + return ret;
> > +}
> > +
> > +static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
> > + const struct iovec *iov, loff_t offset,
> > + unsigned long nr_segs)
> > +{
> > + struct file *file = iocb->ki_filp;
> > + struct inode *inode = file->f_mapping->host;
> > +
> > + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
> > + return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
> > +
> > + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
> > +}
> > +
> > /*
> > * Pages can be marked dirty completely asynchronously from ext4's journalling
> > * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
>
> Honza
Hi,
On Wed 19-08-09 14:26:16, Mingming wrote:
> On Wed, 2009-08-19 at 16:15 +0200, Jan Kara wrote:
> > > Currently DIO VFS code passes create = 0 flag for write to the middle of
> > > file. It does this to avoid block allocation for holes, to prevent
> > > expose stale data out when there is parallel buffered read (which does
> > > not hold the i_mutex lock) while direct IO write has not completed. DIO
> > > request on holes finally falls back to buffered IO for this reason.
> > >
> > > Since preallocated extents are treated as holes when do a get_block()
> > > look up (buffer is not mapped), thus direct IO over fallocate also falls
> > > back to buffered IO. Thus ext4 actually silently falls back to buffered
> > > IO in above two cases.
> > >
> > > The basic idea to fix the direct IO on fallocate issue is to map the
> > > fallocated extents on direct IO write before submit the IO, but wait
> > > until the direct IO complete then convert the unwritten extent to
> > > written extent, using an end_io call back function. We will need to
> > > split the unwritten extents before submit the IO to prevent later
> > > ENOSPC, and when IO complete, just mark the written part initialized.
> > >
> > > With the end_io call back function, now it's possible to do direct IO on
> > > holes. For ext4 extent based file, since we support fallocate, we could
> > > fallocate blocks for holes, mark it mapped but unwritten. This way
> > > parallel buffered IO read returns zeros when parallel DIO write to the
> > > hole has not completed. When direct IO write complete, using the same
> > > end_io schemem to convert those fallocated hole to written extents.
> > Admittedly, I don't think this is the best solution... It can be done
> > but there are quite some corner cases to watch. See my comments in the
> > patch below.
> >
>
> Okay, are you worried about using unwritten extent and end_io call back
> function for direct write to holes? Or to preallocated space? I thought
> it's pretty straighforward for these two cases...
unwritten extents are a good idea. I'm more concerned about using end_io
for doing the conversion...
> > > For direct IO write to the end of file, we now also could get rid of
> > > using orphan list to protect expose stale data out after crash, when
> > > direct write to end of file isn't complete. We now fallocate blocks for
> > > the direct IO write to the end of file as well, and convert those
> > > fallocated blocks at the end of file to written when IO is complete. If
> > > fs crashed before the IO complete, it will only seen the file tail has
> > > been fallocated but won't get the stale data out.
> > But you still probably need orphan list to truncate blocks allocated
> > beyond file end during extending direct write. So I'd just remove this
> > paragraph...
> >
>
> Do we still need to truncate blocks allocated at the end of file? Those
> blocks are marked as uninitialized extents (as any block allocation from
> DIO are flagged as uninitialized extents, before IO is complete), so
> that after recover from crash, the stale data won't get exposed.
>
> I guess I missed the cases you concerned that we need the orphan list to
> protect, could you plain a little more?
Well, you are right it's not a security problem since no data gets
exposed. It's just that after a crash, there will be blocks allocated to
a file and it's not trivial to it find out unless you specifically stat the
file. So it's just *not nice* kind of thing. If it brings us some
advantage to not put the inode on the orphan list, then sure it might be
a tradeoff we are willing to make. But otherwise I see no point.
> > > 1) Block allocation needed for DIO write are fallocated, including holes
> > > and file tail write, marked as unwritten extents after block allocation.
> > >
> > > 2) those unwritten extents, and fallocate extents, will be converted to
> > > written extents (and update disk size when write to end of file)when the
> > > IO is complete. The conversion is triggered using end_io call back
> > > function passing from ext4 fs to direct IO.
> > >
> > > 3) For already fallocated extent, at the time try to map the fallocated
> > > extent, we will split the fallocated extent as necessary, mark the
> > > to-write fallocated extent mapped but still remains unwritten,
> > > insert the splitted extents, to prevent ENOSPC later.
> > >
> > > This first patch does 1) and 2), the second patch does 3)
> > >
> > > Patch against ext4 patch queue.
> > >
> > > Comments?
> > >
> > > Singed-Off-By: Mingming Cao <[email protected]>
> > > ---
> > > fs/ext4/ext4.h | 18 ++++
> > > fs/ext4/inode.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > > fs/ext4/super.c | 11 ++
> > > 3 files changed, 237 insertions(+), 3 deletions(-)
> > >
> > > Index: linux-2.6.31-rc4/fs/ext4/ext4.h
> > > ===================================================================
> > > --- linux-2.6.31-rc4.orig/fs/ext4/ext4.h 2009-08-09 14:46:10.000000000 -0700
> > > +++ linux-2.6.31-rc4/fs/ext4/ext4.h 2009-08-09 23:13:15.000000000 -0700
> > > @@ -111,6 +111,15 @@ struct ext4_allocation_request {
> > > unsigned int flags;
> > > };
> > >
> > > +typedef struct ext4_io_end{
> > > + struct inode *inode; /* file being written to */
> > > + unsigned int type; /* unwritten or written */
> > > + int error; /* I/O error code */
> > > + ext4_lblk_t offset; /* offset in the file */
> > > + size_t size; /* size of the extent */
> > > + struct work_struct work; /* data work queue */
> > > +}ext4_io_end_t;
> > > +
> > > /*
> > > * Special inodes numbers
> > > */
> > > @@ -330,8 +339,8 @@ struct ext4_new_group_data {
> > > /* Call ext4_da_update_reserve_space() after successfully
> > > allocating the blocks */
> > > #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
> > > -
> > > -
> > > +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT 0x0011
> > > +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT 0x0021
> > > /*
> > > * ioctl commands
> > > */
> > > @@ -960,6 +969,9 @@ struct ext4_sb_info {
> > >
> > > unsigned int s_log_groups_per_flex;
> > > struct flex_groups *s_flex_groups;
> > > +
> > > + /* workqueue for dio unwritten */
> > > + struct workqueue_struct *dio_unwritten_wq;
> > > };
> > >
> > > static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> > > @@ -1650,6 +1662,8 @@ extern void ext4_ext_init(struct super_b
> > > extern void ext4_ext_release(struct super_block *);
> > > extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
> > > loff_t len);
> > > +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
> > > + loff_t len);
> > > extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
> > > sector_t block, unsigned int max_blocks,
> > > struct buffer_head *bh, int flags);
> > > Index: linux-2.6.31-rc4/fs/ext4/super.c
> > > ===================================================================
> > > --- linux-2.6.31-rc4.orig/fs/ext4/super.c 2009-08-09 14:51:08.000000000 -0700
> > > +++ linux-2.6.31-rc4/fs/ext4/super.c 2009-08-09 14:51:14.000000000 -0700
> > > @@ -578,6 +578,9 @@ static void ext4_put_super(struct super_
> > > struct ext4_super_block *es = sbi->s_es;
> > > int i, err;
> > >
> > > + flush_workqueue(sbi->dio_unwritten_wq);
> > > + destroy_workqueue(sbi->dio_unwritten_wq);
> > > +
> > > lock_super(sb);
> > > lock_kernel();
> > > if (sb->s_dirt)
> > > @@ -2781,6 +2784,12 @@ no_journal:
> > > clear_opt(sbi->s_mount_opt, NOBH);
> > > }
> > > }
> > > + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
> > > + if (!EXT4_SB(sb)->dio_unwritten_wq) {
> > > + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
> > > + goto failed_mount_wq;
> > > + }
> > > +
> > > /*
> > > * The jbd2_journal_load will have done any necessary log recovery,
> > > * so we can safely mount the rest of the filesystem now.
> > > @@ -2893,6 +2902,8 @@ cantfind_ext4:
> > >
> > > failed_mount4:
> > > ext4_msg(sb, KERN_ERR, "mount failed");
> > > + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
> > > +failed_mount_wq:
> > > ext4_release_system_zone(sb);
> > > if (sbi->s_journal) {
> > > jbd2_journal_destroy(sbi->s_journal);
> > > Index: linux-2.6.31-rc4/fs/ext4/inode.c
> > > ===================================================================
> > > --- linux-2.6.31-rc4.orig/fs/ext4/inode.c 2009-08-09 14:46:32.000000000 -0700
> > > +++ linux-2.6.31-rc4/fs/ext4/inode.c 2009-08-09 14:56:40.000000000 -0700
> > > @@ -37,6 +37,7 @@
> > > #include <linux/namei.h>
> > > #include <linux/uio.h>
> > > #include <linux/bio.h>
> > > +#include <linux/workqueue.h>
> > >
> > > #include "ext4_jbd2.h"
> > > #include "xattr.h"
> > > @@ -3279,6 +3280,8 @@ static int ext4_releasepage(struct page
> > > }
> > >
> > > /*
> > > + * O_DIRECT for ext3 (or indirect map) based files
> > > + *
> > > * If the O_DIRECT write will extend the file then add this inode to the
> > > * orphan list. So recovery will truncate it back to the original size
> > > * if the machine crashes during the write.
> > > @@ -3287,7 +3290,7 @@ static int ext4_releasepage(struct page
> > > * crashes then stale disk data _may_ be exposed inside the file. But current
> > > * VFS code falls back into buffered path in that case so we are safe.
> > > */
> > > -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
> > > +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
> > > const struct iovec *iov, loff_t offset,
> > > unsigned long nr_segs)
> > > {
> > > @@ -3361,6 +3364,212 @@ out:
> > > return ret;
> > > }
> > >
> > > +struct workqueue_struct *ext4_unwritten_queue;
> > > +
> > > +/* Maximum number of blocks we map for direct IO at once. */
> > > +
> > > +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
> > > + struct buffer_head *bh_result, int create)
> > > +{
> > > + handle_t *handle = NULL;
> > > + int ret = 0;
> > > + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
> > > + int dio_credits;
> > > +
> > > + /*
> > > + * DIO VFS code passes create = 0 flag for write to
> > > + * the middle of file. It does this to avoid block
> > > + * allocation for holes, to prevent expose stale data
> > > + * out when there is parallel buffered read (which does
> > > + * not hold the i_mutex lock) while direct IO write has
> > > + * not completed. DIO request on holes finally falls back
> > > + * to buffered IO for this reason.
> > > + *
> > > + * For ext4 extent based file, since we support fallocate,
> > > + * new allocated extent as uninitialized, for holes, we
> > > + * could fallocate blocks for holes, thus parallel
> > > + * buffered IO read will zero out the page when read on
> > > + * a hole while parallel DIO write to the hole has not completed.
> > > + *
> > > + * when we come here, we know it's a direct IO write,
> > > + * so it's safe to override the create flag from VFS.
> > > + */
> > > + create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
> > > +
> > > + if (max_blocks > DIO_MAX_BLOCKS)
> > > + max_blocks = DIO_MAX_BLOCKS;
> > > + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
> > > + handle = ext4_journal_start(inode, dio_credits);
> > > + if (IS_ERR(handle)) {
> > > + ret = PTR_ERR(handle);
> > > + goto out;
> > > + }
> > > + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
> > > + create);
> > > + if (ret > 0) {
> > > + bh_result->b_size = (ret << inode->i_blkbits);
> > > + ret = 0;
> > > + }
> > > + ext4_journal_stop(handle);
> > > +out:
> > > + return ret;
> > > +}
> > > +
> > > +static int ext4_get_block_dio_read(struct inode *inode, sector_t iblock,
> > > + struct buffer_head *bh_result, int create)
> > > +{
> > > + int ret = 0;
> > > + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
> > > + handle_t *handle = NULL;
> > > +
> > > + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
> > > + create);
> > > + if (ret > 0) {
> > > + bh_result->b_size = (ret << inode->i_blkbits);
> > > + ret = 0;
> > > + }
> > > + return ret;
> > > +}
> > Huh, what's the purpose of the above function? We can use normal
> > get_block, can't we?
> >
>
> This is pretty much a wrapper of this normal get_block function, except
> here we need to store the space mapped in b_size, DIO will check that.
But ext4_get_block() will do exactly the same, won't it?
> > > +
> > > +
> > > +#define DIO_UNWRITTEN 0x1
> > > +
> > > +/*
> > > + * IO write completion for unwritten extents.
> > > + *
> > > + * check a range of space and convert unwritten extents to written.
> > > + */
> > > +static void ext4_end_dio_unwritten(struct work_struct *work)
> > > +{
> > > + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> > > + struct inode *inode = io->inode;
> > > + loff_t offset = io->offset;
> > > + size_t size = io->size;
> > > + int ret = 0;
> > > +
> > > + ret = ext4_convert_unwritten_extents(inode, offset, size);
> > > +
> > > + if (ret < 0)
> > > + printk(KERN_EMERG "%s: failed to convert unwritten"
> > > + "extents to written extents, error is %d\n",
> > > + __func__, ret);
> > > + kfree(io);
> > > +}
> > Looking at ext4_convert_unwritten_extents(), you definitely miss some
> > locking. Since this is called completely asynchronously, you have to
> > protect against racing truncates and basically anything can happen with
> > the inode in the mean time.
>
> extents tree update is protected by the i_data_sem which will be hold at
> the ext4_get_blocks() called from ext4_convert_unwritten_extents.
Ah, I've missed that.
> perhaps should grab the i_mutex() which protects the inode update?
Probably we should because we update i_disksize inside
ext4_convert_unwritten_extents(). BTW, where do we update i_size? The same
place where we update i_disksize is probably right and that definitely
needs i_mutex.
> > It needn't be cached in the memory anymore!
>
> Right, we probably need to increase the reference to the inode before
> submit the IO, so the inode would not be push out of cache before IO
> completed.
Yes, that's possible but then you have to be prepared to handle deletes
of an inode from your worker thread because it can drop the last reference
to an already deleted inode.
> > Also fsync() has to flush all the updates for the inode it has in the
> > workqueue... Ditto for ext4_sync_fs().
> >
>
> I think we discussed about this before, and it seems there is no clear
> defination the DIO forces metadata update sync to disk before returns
> back to user apps... If we do force this in ext4 &DIO, doing fsync() on
> every DIO call is expensive.
No, I meant something else:
fd = open("file", O_DIRECT | O_RDWR);
write(fd, buf, size);
fsync(fd)
After fsync(fd) we *must* have everything on disk and reachable even if
we crashed just after fsync(). So conversion of extents from uninitialized
to initialized must happen during fsync() and it does not so far because we
have no connection from an inode to the work queued to the worker thread.
> If file is opened with sync node, then we need to flush all updates for
> inode it has done with DIO. This is what currently ext3 dio does, inode
> update (mtime etc).
Yes, but that's easier. Actually, I have a patch series which makes O_SYNC
handling use standard fsync() path so after that you wouldn't have to care
about it at all.
> > > +
> > > +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int type)
> > > +{
> > > + ext4_io_end_t *io = NULL;
> > > +
> > > + io = kmalloc(sizeof(*io), GFP_NOFS);
> > > +
> > > + if (io) {
> > > + io->inode = inode;
> > > + io->type = type;
> > > + io->offset = 0;
> > > + io->size = 0;
> > > + io->error = 0;
> > > + INIT_WORK(&io->work, ext4_end_dio_unwritten);
> > > + }
> > > +
> > > + return io;
> > > +}
> > > +
> > > +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> > > + ssize_t size, void *private)
> > > +{
> > > + ext4_io_end_t *io_end = iocb->private;
> > > + struct workqueue_struct *wq;
> > > +
> > > + /* if not hole or unwritten extents, just simple return */
> > > + if (!io_end || !size)
> > > + return;
> > > + io_end->offset = offset;
> > > + io_end->size = size;
> > > + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> > > +
> > > + /* We need to convert unwritten extents to written */
> > > + queue_work(wq, &io_end->work);
> > > +
> > > + if (is_sync_kiocb(iocb))
> > > + flush_workqueue(wq);
> > I don't think you can flush_workqueue here. end_io is called from
> > interrupt context and flush_workqueue blocks for a long time...
> > The wait should be done in ext4_direct_IO IMHO...
> >
>
> Okay, I will move it to ext4_direct_IO(), hmm..I think that is fine with
> AIO path as the is_sync_kiocb(iocb) will avoid that.
Yes, this concerns standard IO.
> If flush workqueue just kick off the work on the workqueue but not wait
> for it to complete (no fsync()), would that still be a big concern? BTW,
Waking up the worker thread is certainly fine even from the end_io.
> the flush workqueue only called when file is opened with sync mode.
I don't think so - is_sync_kiocb() just means that it is a standard
read/write and not one submitted via the aio interface (io_submit syscall
etc.).
> > > +
> > > + iocb->private = NULL;
> > > +}
> > > +/*
> > > + * For ext4 extent files, ext4 will do direct-io write to holes,
> > > + * preallocated extents, and those write extend the file, no need to
> > > + * fall back to buffered IO.
> > > + *
> > > + * If there is block allocation needed(holes, EOF write), we fallocate
> > > + * those blocks, mark them as unintialized
> > > + * If those blocks were preallocated, we mark sure they are splited, but
> > > + * still keep the range to write as unintialized.
> > > + *
> > > + * When end_io call back function called at the last IO complete time,
> > > + * those extents will be converted to written extents.
> > > + *
> > > + */
> > > +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> > > + const struct iovec *iov, loff_t offset,
> > > + unsigned long nr_segs)
> > > +{
> > > + struct file *file = iocb->ki_filp;
> > > + struct inode *inode = file->f_mapping->host;
> > > + ssize_t ret;
> > > +
> > > + if (rw == WRITE) {
> > > + /*
> > > + * For DIO we fallocate blocks for holes and end of file
> > > + * write. Those fallocated extents are marked as uninitialized
> > > + * to prevent paralel buffered read to expose the stale data
> > > + * before DIO complete the data IO.
> > > + * as for previously fallocated extents, ext4 get_block
> > > + * will just simply mark the buffer mapped but still
> > > + * keep the extents uninitialized.
> > > + *
> > > + * At the end of IO, the ext4 end_io callback function
> > > + * will convert those unwritten extents to written,
> > > + * and update on disk file size if the DIO expands the file.
> > > + *
> > > + */
> > > + iocb->private = ext4_init_io_end(inode, DIO_UNWRITTEN);
> > > + if (!iocb->private)
> > > + return -ENOMEM;
> > > +
> > > + ret = blockdev_direct_IO(rw, iocb, inode,
> > > + inode->i_sb->s_bdev, iov,
> > > + offset, nr_segs,
> > > + ext4_get_block_dio_write,
> > > + ext4_end_io_dio);
> > > + } else
> > > + ret = blockdev_direct_IO(rw, iocb, inode,
> > > + inode->i_sb->s_bdev, iov,
> > > + offset, nr_segs,
> > > + ext4_get_block_dio_read, NULL);
> > > +
> > > + /*
> > > + * In the case of AIO DIO, VFS dio submitted the IO, but it
> > > + * does not wait for io complete. To prevent expose stale
> > > + * data after crash before IO complete,
> > > + * i_disksize needs to be updated at the
> > > + * time all the IO is completed, not here
> > > + */
> > Yeah, but so far at least ext3_direct_IO and ext4_ind_direct_IO
> > happily update i_size here and noone reported the problem (yet) ;). The
> > thing is that the current transaction has to commit for stale data to be
> > visible and that sends a barrier which also forces blocks written by
> > direct IO to the persistent storage. So at least if the underlying
> > storage supports barriers, we are fine. If it does not, it could maybe
> > reorder direct IO writes after the journal commit (it need not have
> > reported the direct IO as done so far) and after a crash we would have
> > a problem...
> > It's just that I'm not sure that all the trouble with end_io and
> > workqueues is worth it... More code = more bugs ;)
>
> the end_io is there for direct write to fallocate. I am completely sure
> if there is better way? We need to convert the extents to written at the
> end of IO is completed, end_io call back seems works for this purpose.
>
> The workqueue is for AIO case,it would be nice if we could do without it
> for AIO. But read comments from xfs code it seems this is needed if we
> use end_io call back.
Yes, end_io callback is useful for triggering the conversion or
acknowledging that the transaction with the conversion can be committed
and I'm not against using it. Actually, looking at the XFS code, they also
do flush_workqueue() from the end_io callback. Interesting. I guess I'll
ask them why it's not a problem...
What I'm concerned about is the fact that we'd do the conversion from
completely asynchronous context and that opens all sorts of races. So I'd
find simpler to do the conversion from ext4_direct_IO and just make sure the
transaction is not committed before the IO is done (essentially what we
currently do with ordered buffers).
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Wed, Aug 19, 2009 at 04:15:57PM +0200, Jan Kara wrote:
> > +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> > + ssize_t size, void *private)
> > +{
> > + ext4_io_end_t *io_end = iocb->private;
> > + struct workqueue_struct *wq;
> > +
> > + /* if not hole or unwritten extents, just simple return */
> > + if (!io_end || !size)
> > + return;
> > + io_end->offset = offset;
> > + io_end->size = size;
> > + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> > +
> > + /* We need to convert unwritten extents to written */
> > + queue_work(wq, &io_end->work);
> > +
> > + if (is_sync_kiocb(iocb))
> > + flush_workqueue(wq);
> I don't think you can flush_workqueue here. end_io is called from
> interrupt context and flush_workqueue blocks for a long time...
> The wait should be done in ext4_direct_IO IMHO...
I don't see a problem here? This is a direct_io end_io callback, not
a bio callback; so it's only called from an interrupt context in the
async I/O case, and we only call flush_workqueue() when the kiocb is
synchronous.
- Ted
On Thu, 2009-08-20 at 13:52 +0200, Jan Kara wrote:
> Hi,
>
Hi Jan,
Sorry for the delayed response... I was out for a few days.
> On Wed 19-08-09 14:26:16, Mingming wrote:
> > On Wed, 2009-08-19 at 16:15 +0200, Jan Kara wrote:
> > > > Currently DIO VFS code passes create = 0 flag for write to the middle of
> > > > file. It does this to avoid block allocation for holes, to prevent
> > > > expose stale data out when there is parallel buffered read (which does
> > > > not hold the i_mutex lock) while direct IO write has not completed. DIO
> > > > request on holes finally falls back to buffered IO for this reason.
> > > >
> > > > Since preallocated extents are treated as holes when do a get_block()
> > > > look up (buffer is not mapped), thus direct IO over fallocate also falls
> > > > back to buffered IO. Thus ext4 actually silently falls back to buffered
> > > > IO in above two cases.
> > > >
> > > > The basic idea to fix the direct IO on fallocate issue is to map the
> > > > fallocated extents on direct IO write before submit the IO, but wait
> > > > until the direct IO complete then convert the unwritten extent to
> > > > written extent, using an end_io call back function. We will need to
> > > > split the unwritten extents before submit the IO to prevent later
> > > > ENOSPC, and when IO complete, just mark the written part initialized.
> > > >
> > > > With the end_io call back function, now it's possible to do direct IO on
> > > > holes. For ext4 extent based file, since we support fallocate, we could
> > > > fallocate blocks for holes, mark it mapped but unwritten. This way
> > > > parallel buffered IO read returns zeros when parallel DIO write to the
> > > > hole has not completed. When direct IO write complete, using the same
> > > > end_io schemem to convert those fallocated hole to written extents.
> > > Admittedly, I don't think this is the best solution... It can be done
> > > but there are quite some corner cases to watch. See my comments in the
> > > patch below.
> > >
> >
> > Okay, are you worried about using unwritten extent and end_io call back
> > function for direct write to holes? Or to preallocated space? I thought
> > it's pretty straighforward for these two cases...
> unwritten extents are a good idea. I'm more concerned about using end_io
> for doing the conversion...
>
> > > > For direct IO write to the end of file, we now also could get rid of
> > > > using orphan list to protect expose stale data out after crash, when
> > > > direct write to end of file isn't complete. We now fallocate blocks for
> > > > the direct IO write to the end of file as well, and convert those
> > > > fallocated blocks at the end of file to written when IO is complete. If
> > > > fs crashed before the IO complete, it will only seen the file tail has
> > > > been fallocated but won't get the stale data out.
> > > But you still probably need orphan list to truncate blocks allocated
> > > beyond file end during extending direct write. So I'd just remove this
> > > paragraph...
> > >
> >
> > Do we still need to truncate blocks allocated at the end of file? Those
> > blocks are marked as uninitialized extents (as any block allocation from
> > DIO are flagged as uninitialized extents, before IO is complete), so
> > that after recover from crash, the stale data won't get exposed.
> >
> > I guess I missed the cases you concerned that we need the orphan list to
> > protect, could you plain a little more?
> Well, you are right it's not a security problem since no data gets
> exposed. It's just that after a crash, there will be blocks allocated to
> a file and it's not trivial to it find out unless you specifically stat the
> file. So it's just *not nice* kind of thing. If it brings us some
> advantage to not put the inode on the orphan list, then sure it might be
> a tradeoff we are willing to make. But otherwise I see no point.
>
Hmm... I see what you concerned now...user probably don't like allocated
blocks at the end...
I am trying to think of the ext3 case. In ext3 case, inode will be
removed from orphan list once the IO is complete, but that is before the
i_disksize get updated and journal handle being closed. What if the
system crash after inode removed from orphan list but before the
i_disksize get updated?
But since the journal handled has not marked as stopped, even without
putting the inode on the orpah list, wouldn't the open journal handle
and delayed i_disksize update until IO complete time, prevent we see
half DIO data on disk? Though blocks are allocated to the file, but
those block mapping wouldn't show up on disk, no? Did I miss anything?
In the ext4 +patch case (non AIO case), we also close the handle when
end_io completed. If the system crashed we would see the blocks are
allocated but marked as unwritten.
> > > > 1) Block allocation needed for DIO write are fallocated, including holes
> > > > and file tail write, marked as unwritten extents after block allocation.
> > > >
> > > > 2) those unwritten extents, and fallocate extents, will be converted to
> > > > written extents (and update disk size when write to end of file)when the
> > > > IO is complete. The conversion is triggered using end_io call back
> > > > function passing from ext4 fs to direct IO.
> > > >
> > > > 3) For already fallocated extent, at the time try to map the fallocated
> > > > extent, we will split the fallocated extent as necessary, mark the
> > > > to-write fallocated extent mapped but still remains unwritten,
> > > > insert the splitted extents, to prevent ENOSPC later.
> > > >
> > > > This first patch does 1) and 2), the second patch does 3)
> > > >
> > > > Patch against ext4 patch queue.
> > > >
> > > > Comments?
> > > >
> > > > Singed-Off-By: Mingming Cao <[email protected]>
> > > > ---
> > > > fs/ext4/ext4.h | 18 ++++
> > > > fs/ext4/inode.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > fs/ext4/super.c | 11 ++
> > > > 3 files changed, 237 insertions(+), 3 deletions(-)
> > > >
> > > > Index: linux-2.6.31-rc4/fs/ext4/ext4.h
> > > > ===================================================================
> > > > --- linux-2.6.31-rc4.orig/fs/ext4/ext4.h 2009-08-09 14:46:10.000000000 -0700
> > > > +++ linux-2.6.31-rc4/fs/ext4/ext4.h 2009-08-09 23:13:15.000000000 -0700
> > > > @@ -111,6 +111,15 @@ struct ext4_allocation_request {
> > > > unsigned int flags;
> > > > };
> > > >
> > > > +typedef struct ext4_io_end{
> > > > + struct inode *inode; /* file being written to */
> > > > + unsigned int type; /* unwritten or written */
> > > > + int error; /* I/O error code */
> > > > + ext4_lblk_t offset; /* offset in the file */
> > > > + size_t size; /* size of the extent */
> > > > + struct work_struct work; /* data work queue */
> > > > +}ext4_io_end_t;
> > > > +
> > > > /*
> > > > * Special inodes numbers
> > > > */
> > > > @@ -330,8 +339,8 @@ struct ext4_new_group_data {
> > > > /* Call ext4_da_update_reserve_space() after successfully
> > > > allocating the blocks */
> > > > #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
> > > > -
> > > > -
> > > > +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT 0x0011
> > > > +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT 0x0021
> > > > /*
> > > > * ioctl commands
> > > > */
> > > > @@ -960,6 +969,9 @@ struct ext4_sb_info {
> > > >
> > > > unsigned int s_log_groups_per_flex;
> > > > struct flex_groups *s_flex_groups;
> > > > +
> > > > + /* workqueue for dio unwritten */
> > > > + struct workqueue_struct *dio_unwritten_wq;
> > > > };
> > > >
> > > > static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> > > > @@ -1650,6 +1662,8 @@ extern void ext4_ext_init(struct super_b
> > > > extern void ext4_ext_release(struct super_block *);
> > > > extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
> > > > loff_t len);
> > > > +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
> > > > + loff_t len);
> > > > extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
> > > > sector_t block, unsigned int max_blocks,
> > > > struct buffer_head *bh, int flags);
> > > > Index: linux-2.6.31-rc4/fs/ext4/super.c
> > > > ===================================================================
> > > > --- linux-2.6.31-rc4.orig/fs/ext4/super.c 2009-08-09 14:51:08.000000000 -0700
> > > > +++ linux-2.6.31-rc4/fs/ext4/super.c 2009-08-09 14:51:14.000000000 -0700
> > > > @@ -578,6 +578,9 @@ static void ext4_put_super(struct super_
> > > > struct ext4_super_block *es = sbi->s_es;
> > > > int i, err;
> > > >
> > > > + flush_workqueue(sbi->dio_unwritten_wq);
> > > > + destroy_workqueue(sbi->dio_unwritten_wq);
> > > > +
> > > > lock_super(sb);
> > > > lock_kernel();
> > > > if (sb->s_dirt)
> > > > @@ -2781,6 +2784,12 @@ no_journal:
> > > > clear_opt(sbi->s_mount_opt, NOBH);
> > > > }
> > > > }
> > > > + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
> > > > + if (!EXT4_SB(sb)->dio_unwritten_wq) {
> > > > + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
> > > > + goto failed_mount_wq;
> > > > + }
> > > > +
> > > > /*
> > > > * The jbd2_journal_load will have done any necessary log recovery,
> > > > * so we can safely mount the rest of the filesystem now.
> > > > @@ -2893,6 +2902,8 @@ cantfind_ext4:
> > > >
> > > > failed_mount4:
> > > > ext4_msg(sb, KERN_ERR, "mount failed");
> > > > + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
> > > > +failed_mount_wq:
> > > > ext4_release_system_zone(sb);
> > > > if (sbi->s_journal) {
> > > > jbd2_journal_destroy(sbi->s_journal);
> > > > Index: linux-2.6.31-rc4/fs/ext4/inode.c
> > > > ===================================================================
> > > > --- linux-2.6.31-rc4.orig/fs/ext4/inode.c 2009-08-09 14:46:32.000000000 -0700
> > > > +++ linux-2.6.31-rc4/fs/ext4/inode.c 2009-08-09 14:56:40.000000000 -0700
> > > > @@ -37,6 +37,7 @@
> > > > #include <linux/namei.h>
> > > > #include <linux/uio.h>
> > > > #include <linux/bio.h>
> > > > +#include <linux/workqueue.h>
> > > >
> > > > #include "ext4_jbd2.h"
> > > > #include "xattr.h"
> > > > @@ -3279,6 +3280,8 @@ static int ext4_releasepage(struct page
> > > > }
> > > >
> > > > /*
> > > > + * O_DIRECT for ext3 (or indirect map) based files
> > > > + *
> > > > * If the O_DIRECT write will extend the file then add this inode to the
> > > > * orphan list. So recovery will truncate it back to the original size
> > > > * if the machine crashes during the write.
> > > > @@ -3287,7 +3290,7 @@ static int ext4_releasepage(struct page
> > > > * crashes then stale disk data _may_ be exposed inside the file. But current
> > > > * VFS code falls back into buffered path in that case so we are safe.
> > > > */
> > > > -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
> > > > +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
> > > > const struct iovec *iov, loff_t offset,
> > > > unsigned long nr_segs)
> > > > {
> > > > @@ -3361,6 +3364,212 @@ out:
> > > > return ret;
> > > > }
> > > >
> > > > +struct workqueue_struct *ext4_unwritten_queue;
> > > > +
> > > > +/* Maximum number of blocks we map for direct IO at once. */
> > > > +
> > > > +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
> > > > + struct buffer_head *bh_result, int create)
> > > > +{
> > > > + handle_t *handle = NULL;
> > > > + int ret = 0;
> > > > + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
> > > > + int dio_credits;
> > > > +
> > > > + /*
> > > > + * DIO VFS code passes create = 0 flag for write to
> > > > + * the middle of file. It does this to avoid block
> > > > + * allocation for holes, to prevent expose stale data
> > > > + * out when there is parallel buffered read (which does
> > > > + * not hold the i_mutex lock) while direct IO write has
> > > > + * not completed. DIO request on holes finally falls back
> > > > + * to buffered IO for this reason.
> > > > + *
> > > > + * For ext4 extent based file, since we support fallocate,
> > > > + * new allocated extent as uninitialized, for holes, we
> > > > + * could fallocate blocks for holes, thus parallel
> > > > + * buffered IO read will zero out the page when read on
> > > > + * a hole while parallel DIO write to the hole has not completed.
> > > > + *
> > > > + * when we come here, we know it's a direct IO write,
> > > > + * so it's safe to override the create flag from VFS.
> > > > + */
> > > > + create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
> > > > +
> > > > + if (max_blocks > DIO_MAX_BLOCKS)
> > > > + max_blocks = DIO_MAX_BLOCKS;
> > > > + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
> > > > + handle = ext4_journal_start(inode, dio_credits);
> > > > + if (IS_ERR(handle)) {
> > > > + ret = PTR_ERR(handle);
> > > > + goto out;
> > > > + }
> > > > + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
> > > > + create);
> > > > + if (ret > 0) {
> > > > + bh_result->b_size = (ret << inode->i_blkbits);
> > > > + ret = 0;
> > > > + }
> > > > + ext4_journal_stop(handle);
> > > > +out:
> > > > + return ret;
> > > > +}
> > > > +
> > > > +static int ext4_get_block_dio_read(struct inode *inode, sector_t iblock,
> > > > + struct buffer_head *bh_result, int create)
> > > > +{
> > > > + int ret = 0;
> > > > + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
> > > > + handle_t *handle = NULL;
> > > > +
> > > > + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
> > > > + create);
> > > > + if (ret > 0) {
> > > > + bh_result->b_size = (ret << inode->i_blkbits);
> > > > + ret = 0;
> > > > + }
> > > > + return ret;
> > > > +}
> > > Huh, what's the purpose of the above function? We can use normal
> > > get_block, can't we?
> > >
> >
> > This is pretty much a wrapper of this normal get_block function, except
> > here we need to store the space mapped in b_size, DIO will check that.
> But ext4_get_block() will do exactly the same, won't it?
>
Ah, it does the same, with a little more check for "read" or "write"...I
am fine to use ext4_get_block() directly. Will change it as you
suggested.
> > > > +
> > > > +
> > > > +#define DIO_UNWRITTEN 0x1
> > > > +
> > > > +/*
> > > > + * IO write completion for unwritten extents.
> > > > + *
> > > > + * check a range of space and convert unwritten extents to written.
> > > > + */
> > > > +static void ext4_end_dio_unwritten(struct work_struct *work)
> > > > +{
> > > > + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> > > > + struct inode *inode = io->inode;
> > > > + loff_t offset = io->offset;
> > > > + size_t size = io->size;
> > > > + int ret = 0;
> > > > +
> > > > + ret = ext4_convert_unwritten_extents(inode, offset, size);
> > > > +
> > > > + if (ret < 0)
> > > > + printk(KERN_EMERG "%s: failed to convert unwritten"
> > > > + "extents to written extents, error is %d\n",
> > > > + __func__, ret);
> > > > + kfree(io);
> > > > +}
> > > Looking at ext4_convert_unwritten_extents(), you definitely miss some
> > > locking. Since this is called completely asynchronously, you have to
> > > protect against racing truncates and basically anything can happen with
> > > the inode in the mean time.
> >
> > extents tree update is protected by the i_data_sem which will be hold at
> > the ext4_get_blocks() called from ext4_convert_unwritten_extents.
> Ah, I've missed that.
>
> > perhaps should grab the i_mutex() which protects the inode update?
> Probably we should because we update i_disksize inside
> ext4_convert_unwritten_extents(). BTW, where do we update i_size? The same
> place where we update i_disksize is probably right and that definitely
> needs i_mutex.
>
We do update i_size in generic_file_direct_write(), there I think proper
locking (i_mutex lock) is hold.
For i_disksize update, with this patch, we call ext4_update_i_disksize()
from ext4_convert_unwritten_extents(), at the end of IO is completed.
ext4_update_i_disksize() grab the i_data_sem to prevent race with
truncate.
Now I think we are fine with race with truncate... no?
> > > It needn't be cached in the memory anymore!
> >
> > Right, we probably need to increase the reference to the inode before
> > submit the IO, so the inode would not be push out of cache before IO
> > completed.
> Yes, that's possible but then you have to be prepared to handle deletes
> of an inode from your worker thread because it can drop the last reference
> to an already deleted inode.
>
> > > Also fsync() has to flush all the updates for the inode it has in the
> > > workqueue... Ditto for ext4_sync_fs().
> > >
> >
> > I think we discussed about this before, and it seems there is no clear
> > defination the DIO forces metadata update sync to disk before returns
> > back to user apps... If we do force this in ext4 &DIO, doing fsync() on
> > every DIO call is expensive.
> No, I meant something else:
> fd = open("file", O_DIRECT | O_RDWR);
> write(fd, buf, size);
> fsync(fd)
>
> After fsync(fd) we *must* have everything on disk and reachable even if
> we crashed just after fsync(). So conversion of extents from uninitialized
> to initialized must happen during fsync() and it does not so far because we
> have no connection from an inode to the work queued to the worker thread.
>
Are you worried about AIO DIO case?
Because for non AIO case, when DIO returns back to user, the data IO has
already completed, and the conversion of extents from uninitialized to
initialized has already being kicked by flush_workqueue(). Since the
conversion is being journaled, the aftweward fsync() should able to
commit all transactions, thus force the conversion to be complete on
disk after fsync.
For the AIO DIO case, yeah I agree there might be a issue.... Hmm.. if
we do fsync() after AIO DIO, currently I couldn't see any way fsync()
could ensure all the pending data IOs from AIO DIO path would reach to
disk...
if there is a way fsycn() could wait for all pending data IOs from AIO
DIO to complete, then end_io callback would able to trigger the
conversion, and fsycn() would able to flush the metedata update hit to
disk.
> > If file is opened with sync node, then we need to flush all updates for
> > inode it has done with DIO. This is what currently ext3 dio does, inode
> > update (mtime etc).
> Yes, but that's easier. Actually, I have a patch series which makes O_SYNC
> handling use standard fsync() path so after that you wouldn't have to care
> about it at all.
>
> > > > +
> > > > +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int type)
> > > > +{
> > > > + ext4_io_end_t *io = NULL;
> > > > +
> > > > + io = kmalloc(sizeof(*io), GFP_NOFS);
> > > > +
> > > > + if (io) {
> > > > + io->inode = inode;
> > > > + io->type = type;
> > > > + io->offset = 0;
> > > > + io->size = 0;
> > > > + io->error = 0;
> > > > + INIT_WORK(&io->work, ext4_end_dio_unwritten);
> > > > + }
> > > > +
> > > > + return io;
> > > > +}
> > > > +
> > > > +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> > > > + ssize_t size, void *private)
> > > > +{
> > > > + ext4_io_end_t *io_end = iocb->private;
> > > > + struct workqueue_struct *wq;
> > > > +
> > > > + /* if not hole or unwritten extents, just simple return */
> > > > + if (!io_end || !size)
> > > > + return;
> > > > + io_end->offset = offset;
> > > > + io_end->size = size;
> > > > + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> > > > +
> > > > + /* We need to convert unwritten extents to written */
> > > > + queue_work(wq, &io_end->work);
> > > > +
> > > > + if (is_sync_kiocb(iocb))
> > > > + flush_workqueue(wq);
> > > I don't think you can flush_workqueue here. end_io is called from
> > > interrupt context and flush_workqueue blocks for a long time...
> > > The wait should be done in ext4_direct_IO IMHO...
> > >
> >
> > Okay, I will move it to ext4_direct_IO(), hmm..I think that is fine with
> > AIO path as the is_sync_kiocb(iocb) will avoid that.
> Yes, this concerns standard IO.
>
> > If flush workqueue just kick off the work on the workqueue but not wait
> > for it to complete (no fsync()), would that still be a big concern? BTW,
> Waking up the worker thread is certainly fine even from the end_io.
>
Okay, I guess I didn't explain this well with my patch. The
flush_workqueue() here will call ext4_convert_unwritten_extents(), which
just does the conversion, but ext4_convert_unwritten_extents() does not
force the journal commit.
> > the flush workqueue only called when file is opened with sync mode.
> I don't think so - is_sync_kiocb() just means that it is a standard
> read/write and not one submitted via the aio interface (io_submit syscall
> etc.).
>
ah okay, I responsed too soon. yeah, for the DIO regular case (even if
file doesn't open with sync mode), we do ensure the worker thread kick
off the conversion and log the conversion. That's right behavior, I
think.
> > > > +
> > > > + iocb->private = NULL;
> > > > +}
> > > > +/*
> > > > + * For ext4 extent files, ext4 will do direct-io write to holes,
> > > > + * preallocated extents, and those write extend the file, no need to
> > > > + * fall back to buffered IO.
> > > > + *
> > > > + * If there is block allocation needed(holes, EOF write), we fallocate
> > > > + * those blocks, mark them as unintialized
> > > > + * If those blocks were preallocated, we mark sure they are splited, but
> > > > + * still keep the range to write as unintialized.
> > > > + *
> > > > + * When end_io call back function called at the last IO complete time,
> > > > + * those extents will be converted to written extents.
> > > > + *
> > > > + */
> > > > +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> > > > + const struct iovec *iov, loff_t offset,
> > > > + unsigned long nr_segs)
> > > > +{
> > > > + struct file *file = iocb->ki_filp;
> > > > + struct inode *inode = file->f_mapping->host;
> > > > + ssize_t ret;
> > > > +
> > > > + if (rw == WRITE) {
> > > > + /*
> > > > + * For DIO we fallocate blocks for holes and end of file
> > > > + * write. Those fallocated extents are marked as uninitialized
> > > > + * to prevent paralel buffered read to expose the stale data
> > > > + * before DIO complete the data IO.
> > > > + * as for previously fallocated extents, ext4 get_block
> > > > + * will just simply mark the buffer mapped but still
> > > > + * keep the extents uninitialized.
> > > > + *
> > > > + * At the end of IO, the ext4 end_io callback function
> > > > + * will convert those unwritten extents to written,
> > > > + * and update on disk file size if the DIO expands the file.
> > > > + *
> > > > + */
> > > > + iocb->private = ext4_init_io_end(inode, DIO_UNWRITTEN);
> > > > + if (!iocb->private)
> > > > + return -ENOMEM;
> > > > +
> > > > + ret = blockdev_direct_IO(rw, iocb, inode,
> > > > + inode->i_sb->s_bdev, iov,
> > > > + offset, nr_segs,
> > > > + ext4_get_block_dio_write,
> > > > + ext4_end_io_dio);
> > > > + } else
> > > > + ret = blockdev_direct_IO(rw, iocb, inode,
> > > > + inode->i_sb->s_bdev, iov,
> > > > + offset, nr_segs,
> > > > + ext4_get_block_dio_read, NULL);
> > > > +
> > > > + /*
> > > > + * In the case of AIO DIO, VFS dio submitted the IO, but it
> > > > + * does not wait for io complete. To prevent expose stale
> > > > + * data after crash before IO complete,
> > > > + * i_disksize needs to be updated at the
> > > > + * time all the IO is completed, not here
> > > > + */
> > > Yeah, but so far at least ext3_direct_IO and ext4_ind_direct_IO
> > > happily update i_size here and noone reported the problem (yet) ;). The
> > > thing is that the current transaction has to commit for stale data to be
> > > visible and that sends a barrier which also forces blocks written by
> > > direct IO to the persistent storage. So at least if the underlying
> > > storage supports barriers, we are fine. If it does not, it could maybe
> > > reorder direct IO writes after the journal commit (it need not have
> > > reported the direct IO as done so far) and after a crash we would have
> > > a problem...
> > > It's just that I'm not sure that all the trouble with end_io and
> > > workqueues is worth it... More code = more bugs ;)
> >
> > the end_io is there for direct write to fallocate. I am completely sure
> > if there is better way? We need to convert the extents to written at the
> > end of IO is completed, end_io call back seems works for this purpose.
> >
> > The workqueue is for AIO case,it would be nice if we could do without it
> > for AIO. But read comments from xfs code it seems this is needed if we
> > use end_io call back.
> Yes, end_io callback is useful for triggering the conversion or
> acknowledging that the transaction with the conversion can be committed
> and I'm not against using it. Actually, looking at the XFS code, they also
> do flush_workqueue() from the end_io callback. Interesting. I guess I'll
> ask them why it's not a problem...
> What I'm concerned about is the fact that we'd do the conversion from
> completely asynchronous context and that opens all sorts of races.
Yeah in the AIO case we do the conversion from asynchronous context,
but for the non AIO case, the conversion is done under process context
still. I suspect the AIO DIO today for ext3 update the on disk size
before IO is complete is a bigger issue...:)
> So I'd
> find simpler to do the conversion from ext4_direct_IO and just make sure the
> transaction is not committed before the IO is done (essentially what we
> currently do with ordered buffers).
>
True, that might workable....I am concerned this won't work for no
journal mode, though this mode is not commonly used now.:)
Thanks,
Mingming
> Honza
On Mon 24-08-09 15:11:38, Theodore Tso wrote:
> On Wed, Aug 19, 2009 at 04:15:57PM +0200, Jan Kara wrote:
> > > +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> > > + ssize_t size, void *private)
> > > +{
> > > + ext4_io_end_t *io_end = iocb->private;
> > > + struct workqueue_struct *wq;
> > > +
> > > + /* if not hole or unwritten extents, just simple return */
> > > + if (!io_end || !size)
> > > + return;
> > > + io_end->offset = offset;
> > > + io_end->size = size;
> > > + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> > > +
> > > + /* We need to convert unwritten extents to written */
> > > + queue_work(wq, &io_end->work);
> > > +
> > > + if (is_sync_kiocb(iocb))
> > > + flush_workqueue(wq);
> > I don't think you can flush_workqueue here. end_io is called from
> > interrupt context and flush_workqueue blocks for a long time...
> > The wait should be done in ext4_direct_IO IMHO...
>
> I don't see a problem here? This is a direct_io end_io callback, not
> a bio callback; so it's only called from an interrupt context in the
> async I/O case, and we only call flush_workqueue() when the kiocb is
> synchronous.
Ah, right, I didn't realize that DIO end_io callback gets called from
an interrupt context only for async IO. That also explains why XFS is fine
doing the same thing. Thanks for the explanation.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon 24-08-09 14:38:13, Mingming wrote:
> On Thu, 2009-08-20 at 13:52 +0200, Jan Kara wrote:
> > On Wed 19-08-09 14:26:16, Mingming wrote:
> > > On Wed, 2009-08-19 at 16:15 +0200, Jan Kara wrote:
> > > > > For direct IO write to the end of file, we now also could get rid of
> > > > > using orphan list to protect expose stale data out after crash, when
> > > > > direct write to end of file isn't complete. We now fallocate blocks for
> > > > > the direct IO write to the end of file as well, and convert those
> > > > > fallocated blocks at the end of file to written when IO is complete. If
> > > > > fs crashed before the IO complete, it will only seen the file tail has
> > > > > been fallocated but won't get the stale data out.
> > > > But you still probably need orphan list to truncate blocks allocated
> > > > beyond file end during extending direct write. So I'd just remove this
> > > > paragraph...
> > > >
> > >
> > > Do we still need to truncate blocks allocated at the end of file? Those
> > > blocks are marked as uninitialized extents (as any block allocation from
> > > DIO are flagged as uninitialized extents, before IO is complete), so
> > > that after recover from crash, the stale data won't get exposed.
> > >
> > > I guess I missed the cases you concerned that we need the orphan list to
> > > protect, could you plain a little more?
> > Well, you are right it's not a security problem since no data gets
> > exposed. It's just that after a crash, there will be blocks allocated to
> > a file and it's not trivial to it find out unless you specifically stat the
> > file. So it's just *not nice* kind of thing. If it brings us some
> > advantage to not put the inode on the orphan list, then sure it might be
> > a tradeoff we are willing to make. But otherwise I see no point.
> >
>
> Hmm... I see what you concerned now...user probably don't like allocated
> blocks at the end...
Yes, it's kind of counterintuitive that blocks can get allocated but
don't really "show up" in the file (because they are unwritten and beyond
i_size).
> I am trying to think of the ext3 case. In ext3 case, inode will be
> removed from orphan list once the IO is complete, but that is before the
> i_disksize get updated and journal handle being closed. What if the
> system crash after inode removed from orphan list but before the
> i_disksize get updated?
As you write below, until we stop the handle, the transaction cannot be
committed and so no change actually gets written to disk.
> But since the journal handled has not marked as stopped, even without
> putting the inode on the orpah list, wouldn't the open journal handle
> and delayed i_disksize update until IO complete time, prevent we see
> half DIO data on disk? Though blocks are allocated to the file, but
> those block mapping wouldn't show up on disk, no? Did I miss anything?
>
> In the ext4 +patch case (non AIO case), we also close the handle when
> end_io completed. If the system crashed we would see the blocks are
> allocated but marked as unwritten.
Exactly, you end up with allocated and unwritten blocks beyond i_size and
that's what I'd like to avoid if user did not explicitely fallocate() these
blocks.
> > > > > 1) Block allocation needed for DIO write are fallocated, including holes
> > > > > and file tail write, marked as unwritten extents after block allocation.
> > > > >
> > > > > 2) those unwritten extents, and fallocate extents, will be converted to
> > > > > written extents (and update disk size when write to end of file)when the
> > > > > IO is complete. The conversion is triggered using end_io call back
> > > > > function passing from ext4 fs to direct IO.
> > > > >
> > > > > 3) For already fallocated extent, at the time try to map the fallocated
> > > > > extent, we will split the fallocated extent as necessary, mark the
> > > > > to-write fallocated extent mapped but still remains unwritten,
> > > > > insert the splitted extents, to prevent ENOSPC later.
> > > > >
> > > > > This first patch does 1) and 2), the second patch does 3)
> > > > >
> > > > > Patch against ext4 patch queue.
> > > > >
> > > > > Comments?
> > > > >
> > > > > Singed-Off-By: Mingming Cao <[email protected]>
> > > > > ---
> > > > > fs/ext4/ext4.h | 18 ++++
> > > > > fs/ext4/inode.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > fs/ext4/super.c | 11 ++
> > > > > 3 files changed, 237 insertions(+), 3 deletions(-)
> > > > >
> > > > > Index: linux-2.6.31-rc4/fs/ext4/ext4.h
> > > > > ===================================================================
> > > > > --- linux-2.6.31-rc4.orig/fs/ext4/ext4.h 2009-08-09 14:46:10.000000000 -0700
> > > > > +++ linux-2.6.31-rc4/fs/ext4/ext4.h 2009-08-09 23:13:15.000000000 -0700
> > > > > @@ -111,6 +111,15 @@ struct ext4_allocation_request {
> > > > > unsigned int flags;
> > > > > };
> > > > >
> > > > > +typedef struct ext4_io_end{
> > > > > + struct inode *inode; /* file being written to */
> > > > > + unsigned int type; /* unwritten or written */
> > > > > + int error; /* I/O error code */
> > > > > + ext4_lblk_t offset; /* offset in the file */
> > > > > + size_t size; /* size of the extent */
> > > > > + struct work_struct work; /* data work queue */
> > > > > +}ext4_io_end_t;
> > > > > +
> > > > > /*
> > > > > * Special inodes numbers
> > > > > */
> > > > > @@ -330,8 +339,8 @@ struct ext4_new_group_data {
> > > > > /* Call ext4_da_update_reserve_space() after successfully
> > > > > allocating the blocks */
> > > > > #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
> > > > > -
> > > > > -
> > > > > +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT 0x0011
> > > > > +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT 0x0021
> > > > > /*
> > > > > * ioctl commands
> > > > > */
> > > > > @@ -960,6 +969,9 @@ struct ext4_sb_info {
> > > > >
> > > > > unsigned int s_log_groups_per_flex;
> > > > > struct flex_groups *s_flex_groups;
> > > > > +
> > > > > + /* workqueue for dio unwritten */
> > > > > + struct workqueue_struct *dio_unwritten_wq;
> > > > > };
> > > > >
> > > > > static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> > > > > @@ -1650,6 +1662,8 @@ extern void ext4_ext_init(struct super_b
> > > > > extern void ext4_ext_release(struct super_block *);
> > > > > extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
> > > > > loff_t len);
> > > > > +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
> > > > > + loff_t len);
> > > > > extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
> > > > > sector_t block, unsigned int max_blocks,
> > > > > struct buffer_head *bh, int flags);
> > > > > Index: linux-2.6.31-rc4/fs/ext4/super.c
> > > > > ===================================================================
> > > > > --- linux-2.6.31-rc4.orig/fs/ext4/super.c 2009-08-09 14:51:08.000000000 -0700
> > > > > +++ linux-2.6.31-rc4/fs/ext4/super.c 2009-08-09 14:51:14.000000000 -0700
> > > > > @@ -578,6 +578,9 @@ static void ext4_put_super(struct super_
> > > > > struct ext4_super_block *es = sbi->s_es;
> > > > > int i, err;
> > > > >
> > > > > + flush_workqueue(sbi->dio_unwritten_wq);
> > > > > + destroy_workqueue(sbi->dio_unwritten_wq);
> > > > > +
> > > > > lock_super(sb);
> > > > > lock_kernel();
> > > > > if (sb->s_dirt)
> > > > > @@ -2781,6 +2784,12 @@ no_journal:
> > > > > clear_opt(sbi->s_mount_opt, NOBH);
> > > > > }
> > > > > }
> > > > > + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
> > > > > + if (!EXT4_SB(sb)->dio_unwritten_wq) {
> > > > > + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
> > > > > + goto failed_mount_wq;
> > > > > + }
> > > > > +
> > > > > /*
> > > > > * The jbd2_journal_load will have done any necessary log recovery,
> > > > > * so we can safely mount the rest of the filesystem now.
> > > > > @@ -2893,6 +2902,8 @@ cantfind_ext4:
> > > > >
> > > > > failed_mount4:
> > > > > ext4_msg(sb, KERN_ERR, "mount failed");
> > > > > + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
> > > > > +failed_mount_wq:
> > > > > ext4_release_system_zone(sb);
> > > > > if (sbi->s_journal) {
> > > > > jbd2_journal_destroy(sbi->s_journal);
> > > > > Index: linux-2.6.31-rc4/fs/ext4/inode.c
> > > > > ===================================================================
> > > > > --- linux-2.6.31-rc4.orig/fs/ext4/inode.c 2009-08-09 14:46:32.000000000 -0700
> > > > > +++ linux-2.6.31-rc4/fs/ext4/inode.c 2009-08-09 14:56:40.000000000 -0700
> > > > > @@ -37,6 +37,7 @@
> > > > > #include <linux/namei.h>
> > > > > #include <linux/uio.h>
> > > > > #include <linux/bio.h>
> > > > > +#include <linux/workqueue.h>
> > > > >
> > > > > #include "ext4_jbd2.h"
> > > > > #include "xattr.h"
> > > > > @@ -3279,6 +3280,8 @@ static int ext4_releasepage(struct page
> > > > > }
> > > > >
> > > > > /*
> > > > > + * O_DIRECT for ext3 (or indirect map) based files
> > > > > + *
> > > > > * If the O_DIRECT write will extend the file then add this inode to the
> > > > > * orphan list. So recovery will truncate it back to the original size
> > > > > * if the machine crashes during the write.
> > > > > @@ -3287,7 +3290,7 @@ static int ext4_releasepage(struct page
> > > > > * crashes then stale disk data _may_ be exposed inside the file. But current
> > > > > * VFS code falls back into buffered path in that case so we are safe.
> > > > > */
> > > > > -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
> > > > > +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
> > > > > const struct iovec *iov, loff_t offset,
> > > > > unsigned long nr_segs)
> > > > > {
> > > > > @@ -3361,6 +3364,212 @@ out:
> > > > > return ret;
> > > > > }
> > > > >
> > > > > +struct workqueue_struct *ext4_unwritten_queue;
> > > > > +
> > > > > +/* Maximum number of blocks we map for direct IO at once. */
> > > > > +
> > > > > +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
> > > > > + struct buffer_head *bh_result, int create)
> > > > > +{
> > > > > + handle_t *handle = NULL;
> > > > > + int ret = 0;
> > > > > + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
> > > > > + int dio_credits;
> > > > > +
> > > > > + /*
> > > > > + * DIO VFS code passes create = 0 flag for write to
> > > > > + * the middle of file. It does this to avoid block
> > > > > + * allocation for holes, to prevent expose stale data
> > > > > + * out when there is parallel buffered read (which does
> > > > > + * not hold the i_mutex lock) while direct IO write has
> > > > > + * not completed. DIO request on holes finally falls back
> > > > > + * to buffered IO for this reason.
> > > > > + *
> > > > > + * For ext4 extent based file, since we support fallocate,
> > > > > + * new allocated extent as uninitialized, for holes, we
> > > > > + * could fallocate blocks for holes, thus parallel
> > > > > + * buffered IO read will zero out the page when read on
> > > > > + * a hole while parallel DIO write to the hole has not completed.
> > > > > + *
> > > > > + * when we come here, we know it's a direct IO write,
> > > > > + * so it's safe to override the create flag from VFS.
> > > > > + */
> > > > > + create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
> > > > > +
> > > > > + if (max_blocks > DIO_MAX_BLOCKS)
> > > > > + max_blocks = DIO_MAX_BLOCKS;
> > > > > + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
> > > > > + handle = ext4_journal_start(inode, dio_credits);
> > > > > + if (IS_ERR(handle)) {
> > > > > + ret = PTR_ERR(handle);
> > > > > + goto out;
> > > > > + }
> > > > > + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
> > > > > + create);
> > > > > + if (ret > 0) {
> > > > > + bh_result->b_size = (ret << inode->i_blkbits);
> > > > > + ret = 0;
> > > > > + }
> > > > > + ext4_journal_stop(handle);
> > > > > +out:
> > > > > + return ret;
> > > > > +}
> > > > > +
> > > > > +static int ext4_get_block_dio_read(struct inode *inode, sector_t iblock,
> > > > > + struct buffer_head *bh_result, int create)
> > > > > +{
> > > > > + int ret = 0;
> > > > > + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
> > > > > + handle_t *handle = NULL;
> > > > > +
> > > > > + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
> > > > > + create);
> > > > > + if (ret > 0) {
> > > > > + bh_result->b_size = (ret << inode->i_blkbits);
> > > > > + ret = 0;
> > > > > + }
> > > > > + return ret;
> > > > > +}
> > > > Huh, what's the purpose of the above function? We can use normal
> > > > get_block, can't we?
> > > >
> > >
> > > This is pretty much a wrapper of this normal get_block function, except
> > > here we need to store the space mapped in b_size, DIO will check that.
> > But ext4_get_block() will do exactly the same, won't it?
> >
> Ah, it does the same, with a little more check for "read" or "write"...I
> am fine to use ext4_get_block() directly. Will change it as you
> suggested.
>
> > > > > +
> > > > > +
> > > > > +#define DIO_UNWRITTEN 0x1
> > > > > +
> > > > > +/*
> > > > > + * IO write completion for unwritten extents.
> > > > > + *
> > > > > + * check a range of space and convert unwritten extents to written.
> > > > > + */
> > > > > +static void ext4_end_dio_unwritten(struct work_struct *work)
> > > > > +{
> > > > > + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> > > > > + struct inode *inode = io->inode;
> > > > > + loff_t offset = io->offset;
> > > > > + size_t size = io->size;
> > > > > + int ret = 0;
> > > > > +
> > > > > + ret = ext4_convert_unwritten_extents(inode, offset, size);
> > > > > +
> > > > > + if (ret < 0)
> > > > > + printk(KERN_EMERG "%s: failed to convert unwritten"
> > > > > + "extents to written extents, error is %d\n",
> > > > > + __func__, ret);
> > > > > + kfree(io);
> > > > > +}
> > > > Looking at ext4_convert_unwritten_extents(), you definitely miss some
> > > > locking. Since this is called completely asynchronously, you have to
> > > > protect against racing truncates and basically anything can happen with
> > > > the inode in the mean time.
> > >
> > > extents tree update is protected by the i_data_sem which will be hold at
> > > the ext4_get_blocks() called from ext4_convert_unwritten_extents.
> > Ah, I've missed that.
> >
> > > perhaps should grab the i_mutex() which protects the inode update?
> > Probably we should because we update i_disksize inside
> > ext4_convert_unwritten_extents(). BTW, where do we update i_size? The same
> > place where we update i_disksize is probably right and that definitely
> > needs i_mutex.
> >
>
> We do update i_size in generic_file_direct_write(), there I think proper
> locking (i_mutex lock) is hold.
>
> For i_disksize update, with this patch, we call ext4_update_i_disksize()
> from ext4_convert_unwritten_extents(), at the end of IO is completed.
> ext4_update_i_disksize() grab the i_data_sem to prevent race with
> truncate.
>
> Now I think we are fine with race with truncate... no?
I don't think we are fine - i_disksize gets updated without i_data_sem
being held in ext4_setattr() so you can race with it. I'm not sure how
exactly locking rules for ext4 look like wrt. i_disksize update but either
your code or ext4_setattr() need to be changed.
Actually, probably you should hold i_mutex until the io is finished -
otherwise a truncate against the file can be started before the io is
finished and actually nothing prevents it from finishing before your end_io
callback gets processed. Then the extent conversion will get confused I
expect because it won't find extents it should convert.
> > > > It needn't be cached in the memory anymore!
> > >
> > > Right, we probably need to increase the reference to the inode before
> > > submit the IO, so the inode would not be push out of cache before IO
> > > completed.
> > Yes, that's possible but then you have to be prepared to handle deletes
> > of an inode from your worker thread because it can drop the last reference
> > to an already deleted inode.
> >
> > > > Also fsync() has to flush all the updates for the inode it has in the
> > > > workqueue... Ditto for ext4_sync_fs().
> > > >
> > >
> > > I think we discussed about this before, and it seems there is no clear
> > > defination the DIO forces metadata update sync to disk before returns
> > > back to user apps... If we do force this in ext4 &DIO, doing fsync() on
> > > every DIO call is expensive.
> > No, I meant something else:
> > fd = open("file", O_DIRECT | O_RDWR);
> > write(fd, buf, size);
> > fsync(fd)
> >
> > After fsync(fd) we *must* have everything on disk and reachable even if
> > we crashed just after fsync(). So conversion of extents from uninitialized
> > to initialized must happen during fsync() and it does not so far because we
> > have no connection from an inode to the work queued to the worker thread.
> >
>
> Are you worried about AIO DIO case?
>
> Because for non AIO case, when DIO returns back to user, the data IO has
> already completed, and the conversion of extents from uninitialized to
> initialized has already being kicked by flush_workqueue(). Since the
> conversion is being journaled, the aftweward fsync() should able to
> commit all transactions, thus force the conversion to be complete on
> disk after fsync.
Yes, you are right, this case is fine.
> For the AIO DIO case, yeah I agree there might be a issue.... Hmm.. if
> we do fsync() after AIO DIO, currently I couldn't see any way fsync()
> could ensure all the pending data IOs from AIO DIO path would reach to
> disk...
>
> if there is a way fsycn() could wait for all pending data IOs from AIO
> DIO to complete, then end_io callback would able to trigger the
> conversion, and fsycn() would able to flush the metedata update hit to
> disk.
Well, thinking about this again, fsync() does not have to wait for
pending AIO - we never guaranteed anything about what fsync() does to such
requests. But once you complete the AIO (and thus userspace can know that
it is done), you have to make sure that fsync() flushes all the work
queued in the workqueue connected with this AIO.
> > > > > +
> > > > > +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int type)
> > > > > +{
> > > > > + ext4_io_end_t *io = NULL;
> > > > > +
> > > > > + io = kmalloc(sizeof(*io), GFP_NOFS);
> > > > > +
> > > > > + if (io) {
> > > > > + io->inode = inode;
> > > > > + io->type = type;
> > > > > + io->offset = 0;
> > > > > + io->size = 0;
> > > > > + io->error = 0;
> > > > > + INIT_WORK(&io->work, ext4_end_dio_unwritten);
> > > > > + }
> > > > > +
> > > > > + return io;
> > > > > +}
> > > > > +
> > > > > +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> > > > > + ssize_t size, void *private)
> > > > > +{
> > > > > + ext4_io_end_t *io_end = iocb->private;
> > > > > + struct workqueue_struct *wq;
> > > > > +
> > > > > + /* if not hole or unwritten extents, just simple return */
> > > > > + if (!io_end || !size)
> > > > > + return;
> > > > > + io_end->offset = offset;
> > > > > + io_end->size = size;
> > > > > + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> > > > > +
> > > > > + /* We need to convert unwritten extents to written */
> > > > > + queue_work(wq, &io_end->work);
> > > > > +
> > > > > + if (is_sync_kiocb(iocb))
> > > > > + flush_workqueue(wq);
> > > > I don't think you can flush_workqueue here. end_io is called from
> > > > interrupt context and flush_workqueue blocks for a long time...
> > > > The wait should be done in ext4_direct_IO IMHO...
> > > >
> > >
> > > Okay, I will move it to ext4_direct_IO(), hmm..I think that is fine with
> > > AIO path as the is_sync_kiocb(iocb) will avoid that.
> > Yes, this concerns standard IO.
> >
> > > If flush workqueue just kick off the work on the workqueue but not wait
> > > for it to complete (no fsync()), would that still be a big concern? BTW,
> > Waking up the worker thread is certainly fine even from the end_io.
> >
>
> Okay, I guess I didn't explain this well with my patch. The
> flush_workqueue() here will call ext4_convert_unwritten_extents(), which
> just does the conversion, but ext4_convert_unwritten_extents() does not
> force the journal commit.
>
> > > the flush workqueue only called when file is opened with sync mode.
> > I don't think so - is_sync_kiocb() just means that it is a standard
> > read/write and not one submitted via the aio interface (io_submit syscall
> > etc.).
> >
>
> ah okay, I responsed too soon. yeah, for the DIO regular case (even if
> file doesn't open with sync mode), we do ensure the worker thread kick
> off the conversion and log the conversion. That's right behavior, I
> think.
Yes, this part of your patch is correct. Ted explained to me what I have
missed.
> > > > > +
> > > > > + iocb->private = NULL;
> > > > > +}
> > > > > +/*
> > > > > + * For ext4 extent files, ext4 will do direct-io write to holes,
> > > > > + * preallocated extents, and those write extend the file, no need to
> > > > > + * fall back to buffered IO.
> > > > > + *
> > > > > + * If there is block allocation needed(holes, EOF write), we fallocate
> > > > > + * those blocks, mark them as unintialized
> > > > > + * If those blocks were preallocated, we mark sure they are splited, but
> > > > > + * still keep the range to write as unintialized.
> > > > > + *
> > > > > + * When end_io call back function called at the last IO complete time,
> > > > > + * those extents will be converted to written extents.
> > > > > + *
> > > > > + */
> > > > > +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> > > > > + const struct iovec *iov, loff_t offset,
> > > > > + unsigned long nr_segs)
> > > > > +{
> > > > > + struct file *file = iocb->ki_filp;
> > > > > + struct inode *inode = file->f_mapping->host;
> > > > > + ssize_t ret;
> > > > > +
> > > > > + if (rw == WRITE) {
> > > > > + /*
> > > > > + * For DIO we fallocate blocks for holes and end of file
> > > > > + * write. Those fallocated extents are marked as uninitialized
> > > > > + * to prevent paralel buffered read to expose the stale data
> > > > > + * before DIO complete the data IO.
> > > > > + * as for previously fallocated extents, ext4 get_block
> > > > > + * will just simply mark the buffer mapped but still
> > > > > + * keep the extents uninitialized.
> > > > > + *
> > > > > + * At the end of IO, the ext4 end_io callback function
> > > > > + * will convert those unwritten extents to written,
> > > > > + * and update on disk file size if the DIO expands the file.
> > > > > + *
> > > > > + */
> > > > > + iocb->private = ext4_init_io_end(inode, DIO_UNWRITTEN);
> > > > > + if (!iocb->private)
> > > > > + return -ENOMEM;
> > > > > +
> > > > > + ret = blockdev_direct_IO(rw, iocb, inode,
> > > > > + inode->i_sb->s_bdev, iov,
> > > > > + offset, nr_segs,
> > > > > + ext4_get_block_dio_write,
> > > > > + ext4_end_io_dio);
> > > > > + } else
> > > > > + ret = blockdev_direct_IO(rw, iocb, inode,
> > > > > + inode->i_sb->s_bdev, iov,
> > > > > + offset, nr_segs,
> > > > > + ext4_get_block_dio_read, NULL);
> > > > > +
> > > > > + /*
> > > > > + * In the case of AIO DIO, VFS dio submitted the IO, but it
> > > > > + * does not wait for io complete. To prevent expose stale
> > > > > + * data after crash before IO complete,
> > > > > + * i_disksize needs to be updated at the
> > > > > + * time all the IO is completed, not here
> > > > > + */
> > > > Yeah, but so far at least ext3_direct_IO and ext4_ind_direct_IO
> > > > happily update i_size here and noone reported the problem (yet) ;). The
> > > > thing is that the current transaction has to commit for stale data to be
> > > > visible and that sends a barrier which also forces blocks written by
> > > > direct IO to the persistent storage. So at least if the underlying
> > > > storage supports barriers, we are fine. If it does not, it could maybe
> > > > reorder direct IO writes after the journal commit (it need not have
> > > > reported the direct IO as done so far) and after a crash we would have
> > > > a problem...
> > > > It's just that I'm not sure that all the trouble with end_io and
> > > > workqueues is worth it... More code = more bugs ;)
> > >
> > > the end_io is there for direct write to fallocate. I am completely sure
> > > if there is better way? We need to convert the extents to written at the
> > > end of IO is completed, end_io call back seems works for this purpose.
> > >
> > > The workqueue is for AIO case,it would be nice if we could do without it
> > > for AIO. But read comments from xfs code it seems this is needed if we
> > > use end_io call back.
> > Yes, end_io callback is useful for triggering the conversion or
> > acknowledging that the transaction with the conversion can be committed
> > and I'm not against using it. Actually, looking at the XFS code, they also
> > do flush_workqueue() from the end_io callback. Interesting. I guess I'll
> > ask them why it's not a problem...
> > What I'm concerned about is the fact that we'd do the conversion from
> > completely asynchronous context and that opens all sorts of races.
>
> Yeah in the AIO case we do the conversion from asynchronous context,
> but for the non AIO case, the conversion is done under process context
> still. I suspect the AIO DIO today for ext3 update the on disk size
> before IO is complete is a bigger issue...:)
Well, ext3 has a problem that if you do AIO DIO and power fails after
the transaction is committed but before the data really gets written, then
we expose stale data after a crash. I didn't see a single report of this
but in theory the race is there.
Your patch closes this hole but we should better not introduce other
races :).
> > So I'd
> > find simpler to do the conversion from ext4_direct_IO and just make sure the
> > transaction is not committed before the IO is done (essentially what we
> > currently do with ordered buffers).
> >
>
> True, that might workable....I am concerned this won't work for no
> journal mode, though this mode is not commonly used now.:)
Well, if you are running without a journal, you can see stale data after
a crash - there's no way around it. So just not waiting for anything works
well.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Tue, 2009-08-25 at 16:01 +0200, Jan Kara wrote:
> On Mon 24-08-09 14:38:13, Mingming wrote:
> > On Thu, 2009-08-20 at 13:52 +0200, Jan Kara wrote:
> > > On Wed 19-08-09 14:26:16, Mingming wrote:
> > > > On Wed, 2009-08-19 at 16:15 +0200, Jan Kara wrote:
> > > > > > For direct IO write to the end of file, we now also could get rid of
> > > > > > using orphan list to protect expose stale data out after crash, when
> > > > > > direct write to end of file isn't complete. We now fallocate blocks for
> > > > > > the direct IO write to the end of file as well, and convert those
> > > > > > fallocated blocks at the end of file to written when IO is complete. If
> > > > > > fs crashed before the IO complete, it will only seen the file tail has
> > > > > > been fallocated but won't get the stale data out.
> > > > > But you still probably need orphan list to truncate blocks allocated
> > > > > beyond file end during extending direct write. So I'd just remove this
> > > > > paragraph...
> > > > >
> > > >
> > > > Do we still need to truncate blocks allocated at the end of file? Those
> > > > blocks are marked as uninitialized extents (as any block allocation from
> > > > DIO are flagged as uninitialized extents, before IO is complete), so
> > > > that after recover from crash, the stale data won't get exposed.
> > > >
> > > > I guess I missed the cases you concerned that we need the orphan list to
> > > > protect, could you plain a little more?
> > > Well, you are right it's not a security problem since no data gets
> > > exposed. It's just that after a crash, there will be blocks allocated to
> > > a file and it's not trivial to it find out unless you specifically stat the
> > > file. So it's just *not nice* kind of thing. If it brings us some
> > > advantage to not put the inode on the orphan list, then sure it might be
> > > a tradeoff we are willing to make. But otherwise I see no point.
> > >
> >
> > Hmm... I see what you concerned now...user probably don't like allocated
> > blocks at the end...
> Yes, it's kind of counterintuitive that blocks can get allocated but
> don't really "show up" in the file (because they are unwritten and beyond
> i_size).
>
> > I am trying to think of the ext3 case. In ext3 case, inode will be
> > removed from orphan list once the IO is complete, but that is before the
> > i_disksize get updated and journal handle being closed. What if the
> > system crash after inode removed from orphan list but before the
> > i_disksize get updated?
> As you write below, until we stop the handle, the transaction cannot be
> committed and so no change actually gets written to disk.
>
Then in case this, if no change actually gets written to disk, then
i_disksize hasn't get updated on disk, why we need the orphan list?
> > But since the journal handled has not marked as stopped, even without
> > putting the inode on the orpah list, wouldn't the open journal handle
> > and delayed i_disksize update until IO complete time, prevent we see
> > half DIO data on disk? Though blocks are allocated to the file, but
> > those block mapping wouldn't show up on disk, no? Did I miss anything?
> >
> > In the ext4 +patch case (non AIO case), we also close the handle when
> > end_io completed. If the system crashed we would see the blocks are
> > allocated but marked as unwritten.
> Exactly, you end up with allocated and unwritten blocks beyond i_size and
> that's what I'd like to avoid if user did not explicitely fallocate() these
> blocks.
>
Yes but same as ext3, nothing should write to disk until we stop the
handle, so the unwritten blocks flag should also been seen as the
i_disksize has not been updated on disk yet.
> > > > > > 1) Block allocation needed for DIO write are fallocated, including holes
> > > > > > and file tail write, marked as unwritten extents after block allocation.
> > > > > >
> > > > > > 2) those unwritten extents, and fallocate extents, will be converted to
> > > > > > written extents (and update disk size when write to end of file)when the
> > > > > > IO is complete. The conversion is triggered using end_io call back
> > > > > > function passing from ext4 fs to direct IO.
> > > > > >
> > > > > > 3) For already fallocated extent, at the time try to map the fallocated
> > > > > > extent, we will split the fallocated extent as necessary, mark the
> > > > > > to-write fallocated extent mapped but still remains unwritten,
> > > > > > insert the splitted extents, to prevent ENOSPC later.
> > > > > >
> > > > > > This first patch does 1) and 2), the second patch does 3)
> > > > > >
> > > > > > Patch against ext4 patch queue.
> > > > > >
> > > > > > Comments?
> > > > > >
> > > > > > Singed-Off-By: Mingming Cao <[email protected]>
> > > > > > ---
> > > > > > fs/ext4/ext4.h | 18 ++++
> > > > > > fs/ext4/inode.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > > fs/ext4/super.c | 11 ++
> > > > > > 3 files changed, 237 insertions(+), 3 deletions(-)
> > > > > >
> > > > > > Index: linux-2.6.31-rc4/fs/ext4/ext4.h
> > > > > > ===================================================================
> > > > > > --- linux-2.6.31-rc4.orig/fs/ext4/ext4.h 2009-08-09 14:46:10.000000000 -0700
> > > > > > +++ linux-2.6.31-rc4/fs/ext4/ext4.h 2009-08-09 23:13:15.000000000 -0700
> > > > > > @@ -111,6 +111,15 @@ struct ext4_allocation_request {
> > > > > > unsigned int flags;
> > > > > > };
> > > > > >
> > > > > > +typedef struct ext4_io_end{
> > > > > > + struct inode *inode; /* file being written to */
> > > > > > + unsigned int type; /* unwritten or written */
> > > > > > + int error; /* I/O error code */
> > > > > > + ext4_lblk_t offset; /* offset in the file */
> > > > > > + size_t size; /* size of the extent */
> > > > > > + struct work_struct work; /* data work queue */
> > > > > > +}ext4_io_end_t;
> > > > > > +
> > > > > > /*
> > > > > > * Special inodes numbers
> > > > > > */
> > > > > > @@ -330,8 +339,8 @@ struct ext4_new_group_data {
> > > > > > /* Call ext4_da_update_reserve_space() after successfully
> > > > > > allocating the blocks */
> > > > > > #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
> > > > > > -
> > > > > > -
> > > > > > +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT 0x0011
> > > > > > +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT 0x0021
> > > > > > /*
> > > > > > * ioctl commands
> > > > > > */
> > > > > > @@ -960,6 +969,9 @@ struct ext4_sb_info {
> > > > > >
> > > > > > unsigned int s_log_groups_per_flex;
> > > > > > struct flex_groups *s_flex_groups;
> > > > > > +
> > > > > > + /* workqueue for dio unwritten */
> > > > > > + struct workqueue_struct *dio_unwritten_wq;
> > > > > > };
> > > > > >
> > > > > > static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> > > > > > @@ -1650,6 +1662,8 @@ extern void ext4_ext_init(struct super_b
> > > > > > extern void ext4_ext_release(struct super_block *);
> > > > > > extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
> > > > > > loff_t len);
> > > > > > +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
> > > > > > + loff_t len);
> > > > > > extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
> > > > > > sector_t block, unsigned int max_blocks,
> > > > > > struct buffer_head *bh, int flags);
> > > > > > Index: linux-2.6.31-rc4/fs/ext4/super.c
> > > > > > ===================================================================
> > > > > > --- linux-2.6.31-rc4.orig/fs/ext4/super.c 2009-08-09 14:51:08.000000000 -0700
> > > > > > +++ linux-2.6.31-rc4/fs/ext4/super.c 2009-08-09 14:51:14.000000000 -0700
> > > > > > @@ -578,6 +578,9 @@ static void ext4_put_super(struct super_
> > > > > > struct ext4_super_block *es = sbi->s_es;
> > > > > > int i, err;
> > > > > >
> > > > > > + flush_workqueue(sbi->dio_unwritten_wq);
> > > > > > + destroy_workqueue(sbi->dio_unwritten_wq);
> > > > > > +
> > > > > > lock_super(sb);
> > > > > > lock_kernel();
> > > > > > if (sb->s_dirt)
> > > > > > @@ -2781,6 +2784,12 @@ no_journal:
> > > > > > clear_opt(sbi->s_mount_opt, NOBH);
> > > > > > }
> > > > > > }
> > > > > > + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
> > > > > > + if (!EXT4_SB(sb)->dio_unwritten_wq) {
> > > > > > + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
> > > > > > + goto failed_mount_wq;
> > > > > > + }
> > > > > > +
> > > > > > /*
> > > > > > * The jbd2_journal_load will have done any necessary log recovery,
> > > > > > * so we can safely mount the rest of the filesystem now.
> > > > > > @@ -2893,6 +2902,8 @@ cantfind_ext4:
> > > > > >
> > > > > > failed_mount4:
> > > > > > ext4_msg(sb, KERN_ERR, "mount failed");
> > > > > > + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
> > > > > > +failed_mount_wq:
> > > > > > ext4_release_system_zone(sb);
> > > > > > if (sbi->s_journal) {
> > > > > > jbd2_journal_destroy(sbi->s_journal);
> > > > > > Index: linux-2.6.31-rc4/fs/ext4/inode.c
> > > > > > ===================================================================
> > > > > > --- linux-2.6.31-rc4.orig/fs/ext4/inode.c 2009-08-09 14:46:32.000000000 -0700
> > > > > > +++ linux-2.6.31-rc4/fs/ext4/inode.c 2009-08-09 14:56:40.000000000 -0700
> > > > > > @@ -37,6 +37,7 @@
> > > > > > #include <linux/namei.h>
> > > > > > #include <linux/uio.h>
> > > > > > #include <linux/bio.h>
> > > > > > +#include <linux/workqueue.h>
> > > > > >
> > > > > > #include "ext4_jbd2.h"
> > > > > > #include "xattr.h"
> > > > > > @@ -3279,6 +3280,8 @@ static int ext4_releasepage(struct page
> > > > > > }
> > > > > >
> > > > > > /*
> > > > > > + * O_DIRECT for ext3 (or indirect map) based files
> > > > > > + *
> > > > > > * If the O_DIRECT write will extend the file then add this inode to the
> > > > > > * orphan list. So recovery will truncate it back to the original size
> > > > > > * if the machine crashes during the write.
> > > > > > @@ -3287,7 +3290,7 @@ static int ext4_releasepage(struct page
> > > > > > * crashes then stale disk data _may_ be exposed inside the file. But current
> > > > > > * VFS code falls back into buffered path in that case so we are safe.
> > > > > > */
> > > > > > -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
> > > > > > +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
> > > > > > const struct iovec *iov, loff_t offset,
> > > > > > unsigned long nr_segs)
> > > > > > {
> > > > > > @@ -3361,6 +3364,212 @@ out:
> > > > > > return ret;
> > > > > > }
> > > > > >
> > > > > > +struct workqueue_struct *ext4_unwritten_queue;
> > > > > > +
> > > > > > +/* Maximum number of blocks we map for direct IO at once. */
> > > > > > +
> > > > > > +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
> > > > > > + struct buffer_head *bh_result, int create)
> > > > > > +{
> > > > > > + handle_t *handle = NULL;
> > > > > > + int ret = 0;
> > > > > > + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
> > > > > > + int dio_credits;
> > > > > > +
> > > > > > + /*
> > > > > > + * DIO VFS code passes create = 0 flag for write to
> > > > > > + * the middle of file. It does this to avoid block
> > > > > > + * allocation for holes, to prevent expose stale data
> > > > > > + * out when there is parallel buffered read (which does
> > > > > > + * not hold the i_mutex lock) while direct IO write has
> > > > > > + * not completed. DIO request on holes finally falls back
> > > > > > + * to buffered IO for this reason.
> > > > > > + *
> > > > > > + * For ext4 extent based file, since we support fallocate,
> > > > > > + * new allocated extent as uninitialized, for holes, we
> > > > > > + * could fallocate blocks for holes, thus parallel
> > > > > > + * buffered IO read will zero out the page when read on
> > > > > > + * a hole while parallel DIO write to the hole has not completed.
> > > > > > + *
> > > > > > + * when we come here, we know it's a direct IO write,
> > > > > > + * so it's safe to override the create flag from VFS.
> > > > > > + */
> > > > > > + create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
> > > > > > +
> > > > > > + if (max_blocks > DIO_MAX_BLOCKS)
> > > > > > + max_blocks = DIO_MAX_BLOCKS;
> > > > > > + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
> > > > > > + handle = ext4_journal_start(inode, dio_credits);
> > > > > > + if (IS_ERR(handle)) {
> > > > > > + ret = PTR_ERR(handle);
> > > > > > + goto out;
> > > > > > + }
> > > > > > + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
> > > > > > + create);
> > > > > > + if (ret > 0) {
> > > > > > + bh_result->b_size = (ret << inode->i_blkbits);
> > > > > > + ret = 0;
> > > > > > + }
> > > > > > + ext4_journal_stop(handle);
> > > > > > +out:
> > > > > > + return ret;
> > > > > > +}
> > > > > > +
> > > > > > +static int ext4_get_block_dio_read(struct inode *inode, sector_t iblock,
> > > > > > + struct buffer_head *bh_result, int create)
> > > > > > +{
> > > > > > + int ret = 0;
> > > > > > + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
> > > > > > + handle_t *handle = NULL;
> > > > > > +
> > > > > > + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
> > > > > > + create);
> > > > > > + if (ret > 0) {
> > > > > > + bh_result->b_size = (ret << inode->i_blkbits);
> > > > > > + ret = 0;
> > > > > > + }
> > > > > > + return ret;
> > > > > > +}
> > > > > Huh, what's the purpose of the above function? We can use normal
> > > > > get_block, can't we?
> > > > >
> > > >
> > > > This is pretty much a wrapper of this normal get_block function, except
> > > > here we need to store the space mapped in b_size, DIO will check that.
> > > But ext4_get_block() will do exactly the same, won't it?
> > >
> > Ah, it does the same, with a little more check for "read" or "write"...I
> > am fine to use ext4_get_block() directly. Will change it as you
> > suggested.
> >
> > > > > > +
> > > > > > +
> > > > > > +#define DIO_UNWRITTEN 0x1
> > > > > > +
> > > > > > +/*
> > > > > > + * IO write completion for unwritten extents.
> > > > > > + *
> > > > > > + * check a range of space and convert unwritten extents to written.
> > > > > > + */
> > > > > > +static void ext4_end_dio_unwritten(struct work_struct *work)
> > > > > > +{
> > > > > > + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> > > > > > + struct inode *inode = io->inode;
> > > > > > + loff_t offset = io->offset;
> > > > > > + size_t size = io->size;
> > > > > > + int ret = 0;
> > > > > > +
> > > > > > + ret = ext4_convert_unwritten_extents(inode, offset, size);
> > > > > > +
> > > > > > + if (ret < 0)
> > > > > > + printk(KERN_EMERG "%s: failed to convert unwritten"
> > > > > > + "extents to written extents, error is %d\n",
> > > > > > + __func__, ret);
> > > > > > + kfree(io);
> > > > > > +}
> > > > > Looking at ext4_convert_unwritten_extents(), you definitely miss some
> > > > > locking. Since this is called completely asynchronously, you have to
> > > > > protect against racing truncates and basically anything can happen with
> > > > > the inode in the mean time.
> > > >
> > > > extents tree update is protected by the i_data_sem which will be hold at
> > > > the ext4_get_blocks() called from ext4_convert_unwritten_extents.
> > > Ah, I've missed that.
> > >
> > > > perhaps should grab the i_mutex() which protects the inode update?
> > > Probably we should because we update i_disksize inside
> > > ext4_convert_unwritten_extents(). BTW, where do we update i_size? The same
> > > place where we update i_disksize is probably right and that definitely
> > > needs i_mutex.
> > >
> >
> > We do update i_size in generic_file_direct_write(), there I think proper
> > locking (i_mutex lock) is hold.
> >
> > For i_disksize update, with this patch, we call ext4_update_i_disksize()
> > from ext4_convert_unwritten_extents(), at the end of IO is completed.
> > ext4_update_i_disksize() grab the i_data_sem to prevent race with
> > truncate.
> >
> > Now I think we are fine with race with truncate... no?
> I don't think we are fine - i_disksize gets updated without i_data_sem
> being held in ext4_setattr() so you can race with it. I'm not sure how
> exactly locking rules for ext4 look like wrt. i_disksize update but either
> your code or ext4_setattr() need to be changed.
then ext4_setattr() race with other places using
ext4_update_i_disksize() (e.g. fallocate) already. we need to fix
ext4_setattr and probably other places where i_disksize is updated
without holding i_data_sem.
> Actually, probably you should hold i_mutex until the io is finished -
> otherwise a truncate against the file can be started before the io is
> finished and actually nothing prevents it from finishing before your end_io
> callback gets processed. Then the extent conversion will get confused I
> expect because it won't find extents it should convert.
>
DIO already did that:) i_mutex is hold during the whole direct IO,
before return back to user, in the non AIO case.
> > > > > It needn't be cached in the memory anymore!
> > > >
> > > > Right, we probably need to increase the reference to the inode before
> > > > submit the IO, so the inode would not be push out of cache before IO
> > > > completed.
> > > Yes, that's possible but then you have to be prepared to handle deletes
> > > of an inode from your worker thread because it can drop the last reference
> > > to an already deleted inode.
> > >
> > > > > Also fsync() has to flush all the updates for the inode it has in the
> > > > > workqueue... Ditto for ext4_sync_fs().
> > > > >
> > > >
> > > > I think we discussed about this before, and it seems there is no clear
> > > > defination the DIO forces metadata update sync to disk before returns
> > > > back to user apps... If we do force this in ext4 &DIO, doing fsync() on
> > > > every DIO call is expensive.
> > > No, I meant something else:
> > > fd = open("file", O_DIRECT | O_RDWR);
> > > write(fd, buf, size);
> > > fsync(fd)
> > >
> > > After fsync(fd) we *must* have everything on disk and reachable even if
> > > we crashed just after fsync(). So conversion of extents from uninitialized
> > > to initialized must happen during fsync() and it does not so far because we
> > > have no connection from an inode to the work queued to the worker thread.
> > >
> >
> > Are you worried about AIO DIO case?
> >
> > Because for non AIO case, when DIO returns back to user, the data IO has
> > already completed, and the conversion of extents from uninitialized to
> > initialized has already being kicked by flush_workqueue(). Since the
> > conversion is being journaled, the aftweward fsync() should able to
> > commit all transactions, thus force the conversion to be complete on
> > disk after fsync.
> Yes, you are right, this case is fine.
>
> > For the AIO DIO case, yeah I agree there might be a issue.... Hmm.. if
> > we do fsync() after AIO DIO, currently I couldn't see any way fsync()
> > could ensure all the pending data IOs from AIO DIO path would reach to
> > disk...
> >
> > if there is a way fsycn() could wait for all pending data IOs from AIO
> > DIO to complete, then end_io callback would able to trigger the
> > conversion, and fsycn() would able to flush the metedata update hit to
> > disk.
> Well, thinking about this again, fsync() does not have to wait for
> pending AIO - we never guaranteed anything about what fsync() does to such
> requests. But once you complete the AIO (and thus userspace can know that
> it is done), you have to make sure that fsync() flushes all the work
> queued in the workqueue connected with this AIO.
>
I just realize, why don't we flush the work in the workqueue with this
AIO at the IO completion time (but not ext4_direct_IO time)? Since the
work just does conversion but not the full journal commit. That would
work if there is fsync() after the AIO is completed.
> > > > > > +
> > > > > > +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int type)
> > > > > > +{
> > > > > > + ext4_io_end_t *io = NULL;
> > > > > > +
> > > > > > + io = kmalloc(sizeof(*io), GFP_NOFS);
> > > > > > +
> > > > > > + if (io) {
> > > > > > + io->inode = inode;
> > > > > > + io->type = type;
> > > > > > + io->offset = 0;
> > > > > > + io->size = 0;
> > > > > > + io->error = 0;
> > > > > > + INIT_WORK(&io->work, ext4_end_dio_unwritten);
> > > > > > + }
> > > > > > +
> > > > > > + return io;
> > > > > > +}
> > > > > > +
> > > > > > +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> > > > > > + ssize_t size, void *private)
> > > > > > +{
> > > > > > + ext4_io_end_t *io_end = iocb->private;
> > > > > > + struct workqueue_struct *wq;
> > > > > > +
> > > > > > + /* if not hole or unwritten extents, just simple return */
> > > > > > + if (!io_end || !size)
> > > > > > + return;
> > > > > > + io_end->offset = offset;
> > > > > > + io_end->size = size;
> > > > > > + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> > > > > > +
> > > > > > + /* We need to convert unwritten extents to written */
> > > > > > + queue_work(wq, &io_end->work);
> > > > > > +
> > > > > > + if (is_sync_kiocb(iocb))
> > > > > > + flush_workqueue(wq);
> > > > > I don't think you can flush_workqueue here. end_io is called from
> > > > > interrupt context and flush_workqueue blocks for a long time...
> > > > > The wait should be done in ext4_direct_IO IMHO...
> > > > >
> > > >
> > > > Okay, I will move it to ext4_direct_IO(), hmm..I think that is fine with
> > > > AIO path as the is_sync_kiocb(iocb) will avoid that.
> > > Yes, this concerns standard IO.
> > >
> > > > If flush workqueue just kick off the work on the workqueue but not wait
> > > > for it to complete (no fsync()), would that still be a big concern? BTW,
> > > Waking up the worker thread is certainly fine even from the end_io.
> > >
> >
> > Okay, I guess I didn't explain this well with my patch. The
> > flush_workqueue() here will call ext4_convert_unwritten_extents(), which
> > just does the conversion, but ext4_convert_unwritten_extents() does not
> > force the journal commit.
> >
> > > > the flush workqueue only called when file is opened with sync mode.
> > > I don't think so - is_sync_kiocb() just means that it is a standard
> > > read/write and not one submitted via the aio interface (io_submit syscall
> > > etc.).
> > >
> >
> > ah okay, I responsed too soon. yeah, for the DIO regular case (even if
> > file doesn't open with sync mode), we do ensure the worker thread kick
> > off the conversion and log the conversion. That's right behavior, I
> > think.
> Yes, this part of your patch is correct. Ted explained to me what I have
> missed.
>
> > > > > > +
> > > > > > + iocb->private = NULL;
> > > > > > +}
> > > > > > +/*
> > > > > > + * For ext4 extent files, ext4 will do direct-io write to holes,
> > > > > > + * preallocated extents, and those write extend the file, no need to
> > > > > > + * fall back to buffered IO.
> > > > > > + *
> > > > > > + * If there is block allocation needed(holes, EOF write), we fallocate
> > > > > > + * those blocks, mark them as unintialized
> > > > > > + * If those blocks were preallocated, we mark sure they are splited, but
> > > > > > + * still keep the range to write as unintialized.
> > > > > > + *
> > > > > > + * When end_io call back function called at the last IO complete time,
> > > > > > + * those extents will be converted to written extents.
> > > > > > + *
> > > > > > + */
> > > > > > +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> > > > > > + const struct iovec *iov, loff_t offset,
> > > > > > + unsigned long nr_segs)
> > > > > > +{
> > > > > > + struct file *file = iocb->ki_filp;
> > > > > > + struct inode *inode = file->f_mapping->host;
> > > > > > + ssize_t ret;
> > > > > > +
> > > > > > + if (rw == WRITE) {
> > > > > > + /*
> > > > > > + * For DIO we fallocate blocks for holes and end of file
> > > > > > + * write. Those fallocated extents are marked as uninitialized
> > > > > > + * to prevent paralel buffered read to expose the stale data
> > > > > > + * before DIO complete the data IO.
> > > > > > + * as for previously fallocated extents, ext4 get_block
> > > > > > + * will just simply mark the buffer mapped but still
> > > > > > + * keep the extents uninitialized.
> > > > > > + *
> > > > > > + * At the end of IO, the ext4 end_io callback function
> > > > > > + * will convert those unwritten extents to written,
> > > > > > + * and update on disk file size if the DIO expands the file.
> > > > > > + *
> > > > > > + */
> > > > > > + iocb->private = ext4_init_io_end(inode, DIO_UNWRITTEN);
> > > > > > + if (!iocb->private)
> > > > > > + return -ENOMEM;
> > > > > > +
> > > > > > + ret = blockdev_direct_IO(rw, iocb, inode,
> > > > > > + inode->i_sb->s_bdev, iov,
> > > > > > + offset, nr_segs,
> > > > > > + ext4_get_block_dio_write,
> > > > > > + ext4_end_io_dio);
> > > > > > + } else
> > > > > > + ret = blockdev_direct_IO(rw, iocb, inode,
> > > > > > + inode->i_sb->s_bdev, iov,
> > > > > > + offset, nr_segs,
> > > > > > + ext4_get_block_dio_read, NULL);
> > > > > > +
> > > > > > + /*
> > > > > > + * In the case of AIO DIO, VFS dio submitted the IO, but it
> > > > > > + * does not wait for io complete. To prevent expose stale
> > > > > > + * data after crash before IO complete,
> > > > > > + * i_disksize needs to be updated at the
> > > > > > + * time all the IO is completed, not here
> > > > > > + */
> > > > > Yeah, but so far at least ext3_direct_IO and ext4_ind_direct_IO
> > > > > happily update i_size here and noone reported the problem (yet) ;). The
> > > > > thing is that the current transaction has to commit for stale data to be
> > > > > visible and that sends a barrier which also forces blocks written by
> > > > > direct IO to the persistent storage. So at least if the underlying
> > > > > storage supports barriers, we are fine. If it does not, it could maybe
> > > > > reorder direct IO writes after the journal commit (it need not have
> > > > > reported the direct IO as done so far) and after a crash we would have
> > > > > a problem...
> > > > > It's just that I'm not sure that all the trouble with end_io and
> > > > > workqueues is worth it... More code = more bugs ;)
> > > >
> > > > the end_io is there for direct write to fallocate. I am completely sure
> > > > if there is better way? We need to convert the extents to written at the
> > > > end of IO is completed, end_io call back seems works for this purpose.
> > > >
> > > > The workqueue is for AIO case,it would be nice if we could do without it
> > > > for AIO. But read comments from xfs code it seems this is needed if we
> > > > use end_io call back.
> > > Yes, end_io callback is useful for triggering the conversion or
> > > acknowledging that the transaction with the conversion can be committed
> > > and I'm not against using it. Actually, looking at the XFS code, they also
> > > do flush_workqueue() from the end_io callback. Interesting. I guess I'll
> > > ask them why it's not a problem...
> > > What I'm concerned about is the fact that we'd do the conversion from
> > > completely asynchronous context and that opens all sorts of races.
> >
> > Yeah in the AIO case we do the conversion from asynchronous context,
> > but for the non AIO case, the conversion is done under process context
> > still. I suspect the AIO DIO today for ext3 update the on disk size
> > before IO is complete is a bigger issue...:)
> Well, ext3 has a problem that if you do AIO DIO and power fails after
> the transaction is committed but before the data really gets written, then
> we expose stale data after a crash. I didn't see a single report of this
> but in theory the race is there.
> Your patch closes this hole but we should better not introduce other
> races :).
>
> > > So I'd
> > > find simpler to do the conversion from ext4_direct_IO and just make sure the
> > > transaction is not committed before the IO is done (essentially what we
> > > currently do with ordered buffers).
> > >
> >
> > True, that might workable....I am concerned this won't work for no
> > journal mode, though this mode is not commonly used now.:)
> Well, if you are running without a journal, you can see stale data after
> a crash - there's no way around it. So just not waiting for anything works
> well.
>
But in the direct IO case, with the end_io call back, we won't update
i_disksize until IO is completed, you should not see stale data after a
crash, even without journal, am I miss anything?
Mingming
> Honza
On Tue, 2009-08-25 at 12:41 -0700, Mingming wrote:
> On Tue, 2009-08-25 at 16:01 +0200, Jan Kara wrote:
> > On Mon 24-08-09 14:38:13, Mingming wrote:
> > > On Thu, 2009-08-20 at 13:52 +0200, Jan Kara wrote:
> > > > On Wed 19-08-09 14:26:16, Mingming wrote:
> > > > > On Wed, 2009-08-19 at 16:15 +0200, Jan Kara wrote:
> > > > > > > For direct IO write to the end of file, we now also could get rid of
> > > > > > > using orphan list to protect expose stale data out after crash, when
> > > > > > > direct write to end of file isn't complete. We now fallocate blocks for
> > > > > > > the direct IO write to the end of file as well, and convert those
> > > > > > > fallocated blocks at the end of file to written when IO is complete. If
> > > > > > > fs crashed before the IO complete, it will only seen the file tail has
> > > > > > > been fallocated but won't get the stale data out.
> > > > > > But you still probably need orphan list to truncate blocks allocated
> > > > > > beyond file end during extending direct write. So I'd just remove this
> > > > > > paragraph...
> > > > > >
> > > > >
> > > > > Do we still need to truncate blocks allocated at the end of file? Those
> > > > > blocks are marked as uninitialized extents (as any block allocation from
> > > > > DIO are flagged as uninitialized extents, before IO is complete), so
> > > > > that after recover from crash, the stale data won't get exposed.
> > > > >
> > > > > I guess I missed the cases you concerned that we need the orphan list to
> > > > > protect, could you plain a little more?
> > > > Well, you are right it's not a security problem since no data gets
> > > > exposed. It's just that after a crash, there will be blocks allocated to
> > > > a file and it's not trivial to it find out unless you specifically stat the
> > > > file. So it's just *not nice* kind of thing. If it brings us some
> > > > advantage to not put the inode on the orphan list, then sure it might be
> > > > a tradeoff we are willing to make. But otherwise I see no point.
> > > >
> > >
> > > Hmm... I see what you concerned now...user probably don't like allocated
> > > blocks at the end...
> > Yes, it's kind of counterintuitive that blocks can get allocated but
> > don't really "show up" in the file (because they are unwritten and beyond
> > i_size).
> >
> > > I am trying to think of the ext3 case. In ext3 case, inode will be
> > > removed from orphan list once the IO is complete, but that is before the
> > > i_disksize get updated and journal handle being closed. What if the
> > > system crash after inode removed from orphan list but before the
> > > i_disksize get updated?
> > As you write below, until we stop the handle, the transaction cannot be
> > committed and so no change actually gets written to disk.
> >
>
> Then in case this, if no change actually gets written to disk, then
> i_disksize hasn't get updated on disk, why we need the orphan list?
>
Never mind, I missed the fact that i_size was updated in ext3_direct_IO
before submit the IO, so orphan list is there to truncate i_size to
i_disksize in case of crash.
> > > But since the journal handled has not marked as stopped, even without
> > > putting the inode on the orpah list, wouldn't the open journal handle
> > > and delayed i_disksize update until IO complete time, prevent we see
> > > half DIO data on disk? Though blocks are allocated to the file, but
> > > those block mapping wouldn't show up on disk, no? Did I miss anything?
> > >
> > > In the ext4 +patch case (non AIO case), we also close the handle when
> > > end_io completed. If the system crashed we would see the blocks are
> > > allocated but marked as unwritten.
> > Exactly, you end up with allocated and unwritten blocks beyond i_size and
> > that's what I'd like to avoid if user did not explicitely fallocate() these
> > blocks.
> >
>
> Yes but same as ext3, nothing should write to disk until we stop the
> handle, so the unwritten blocks flag should also been seen as the
> i_disksize has not been updated on disk yet.
>
Please ignore this, I see what you are saying..
I think it over today, I understand the race is with AIO DIO io has not
completed, while there are truncate or other non AIO DIO which happened
completed before the pending AIO DIO completed. What we do with
i_disksize?
Maybe I just split the patches, only deals with holes and preallocation,
with doesn't need to update i_disksize... and worry about fixing DIO
write to the end of file for AIO later.
Mingming
On Tue 25-08-09 17:49:08, Mingming wrote:
> On Tue, 2009-08-25 at 12:41 -0700, Mingming wrote:
> > On Tue, 2009-08-25 at 16:01 +0200, Jan Kara wrote:
> > > On Mon 24-08-09 14:38:13, Mingming wrote:
> > > > On Thu, 2009-08-20 at 13:52 +0200, Jan Kara wrote:
> > > > > On Wed 19-08-09 14:26:16, Mingming wrote:
> > > > > > On Wed, 2009-08-19 at 16:15 +0200, Jan Kara wrote:
> > > > > > > > For direct IO write to the end of file, we now also could get rid of
> > > > > > > > using orphan list to protect expose stale data out after crash, when
> > > > > > > > direct write to end of file isn't complete. We now fallocate blocks for
> > > > > > > > the direct IO write to the end of file as well, and convert those
> > > > > > > > fallocated blocks at the end of file to written when IO is complete. If
> > > > > > > > fs crashed before the IO complete, it will only seen the file tail has
> > > > > > > > been fallocated but won't get the stale data out.
> > > > > > > But you still probably need orphan list to truncate blocks allocated
> > > > > > > beyond file end during extending direct write. So I'd just remove this
> > > > > > > paragraph...
> > > > > > >
> > > > > >
> > > > > > Do we still need to truncate blocks allocated at the end of file? Those
> > > > > > blocks are marked as uninitialized extents (as any block allocation from
> > > > > > DIO are flagged as uninitialized extents, before IO is complete), so
> > > > > > that after recover from crash, the stale data won't get exposed.
> > > > > >
> > > > > > I guess I missed the cases you concerned that we need the orphan list to
> > > > > > protect, could you plain a little more?
> > > > > Well, you are right it's not a security problem since no data gets
> > > > > exposed. It's just that after a crash, there will be blocks allocated to
> > > > > a file and it's not trivial to it find out unless you specifically stat the
> > > > > file. So it's just *not nice* kind of thing. If it brings us some
> > > > > advantage to not put the inode on the orphan list, then sure it might be
> > > > > a tradeoff we are willing to make. But otherwise I see no point.
> > > > >
> > > >
> > > > Hmm... I see what you concerned now...user probably don't like allocated
> > > > blocks at the end...
> > > Yes, it's kind of counterintuitive that blocks can get allocated but
> > > don't really "show up" in the file (because they are unwritten and beyond
> > > i_size).
> > >
> > > > I am trying to think of the ext3 case. In ext3 case, inode will be
> > > > removed from orphan list once the IO is complete, but that is before the
> > > > i_disksize get updated and journal handle being closed. What if the
> > > > system crash after inode removed from orphan list but before the
> > > > i_disksize get updated?
> > > As you write below, until we stop the handle, the transaction cannot be
> > > committed and so no change actually gets written to disk.
> > >
> >
> > Then in case this, if no change actually gets written to disk, then
> > i_disksize hasn't get updated on disk, why we need the orphan list?
> >
>
> Never mind, I missed the fact that i_size was updated in ext3_direct_IO
> before submit the IO, so orphan list is there to truncate i_size to
> i_disksize in case of crash.
I think we still don't understand ourselves. We don't care to much about
i_size being updated because we never write it to disk. We write only
i_disksize and that gets updated in the end. The problem is:
create(f)
direct_write(f, buf, 65536)
-> allocates 32 KB beyond EOF and then crash happens
If 'f' was not on the orphan list you'd see:
ls -l
-rw-r--r-- 1 jack users 0 2008-06-26 11:01 f
but the file has actually those 32 KB allocated and user has no way of
knowing about that.
Because 'f' *is* on orphan list, we free those 32 KB beyond EOF and
everything is happy.
That's the whole point I was trying to make.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
> > > We do update i_size in generic_file_direct_write(), there I think proper
> > > locking (i_mutex lock) is hold.
> > >
> > > For i_disksize update, with this patch, we call ext4_update_i_disksize()
> > > from ext4_convert_unwritten_extents(), at the end of IO is completed.
> > > ext4_update_i_disksize() grab the i_data_sem to prevent race with
> > > truncate.
> > >
> > > Now I think we are fine with race with truncate... no?
> > I don't think we are fine - i_disksize gets updated without i_data_sem
> > being held in ext4_setattr() so you can race with it. I'm not sure how
> > exactly locking rules for ext4 look like wrt. i_disksize update but either
> > your code or ext4_setattr() need to be changed.
>
> then ext4_setattr() race with other places using
> ext4_update_i_disksize() (e.g. fallocate) already. we need to fix
> ext4_setattr and probably other places where i_disksize is updated
> without holding i_data_sem.
OK.
> > Actually, probably you should hold i_mutex until the io is finished -
> > otherwise a truncate against the file can be started before the io is
> > finished and actually nothing prevents it from finishing before your end_io
> > callback gets processed. Then the extent conversion will get confused I
> > expect because it won't find extents it should convert.
> >
> DIO already did that:) i_mutex is hold during the whole direct IO,
> before return back to user, in the non AIO case.
Yes, the non-AIO case is fine. But the AIO case is problematic - and it's
not just about i_disksize update. Also the conversion from unwritten to
written extents could have problems with the file being truncated or
otherwise modified in the mean time.
> > > > > > It needn't be cached in the memory anymore!
> > > > >
> > > > > Right, we probably need to increase the reference to the inode before
> > > > > submit the IO, so the inode would not be push out of cache before IO
> > > > > completed.
> > > > Yes, that's possible but then you have to be prepared to handle deletes
> > > > of an inode from your worker thread because it can drop the last reference
> > > > to an already deleted inode.
> > > >
> > > > > > Also fsync() has to flush all the updates for the inode it has in the
> > > > > > workqueue... Ditto for ext4_sync_fs().
> > > > > >
> > > > >
> > > > > I think we discussed about this before, and it seems there is no clear
> > > > > defination the DIO forces metadata update sync to disk before returns
> > > > > back to user apps... If we do force this in ext4 &DIO, doing fsync() on
> > > > > every DIO call is expensive.
> > > > No, I meant something else:
> > > > fd = open("file", O_DIRECT | O_RDWR);
> > > > write(fd, buf, size);
> > > > fsync(fd)
> > > >
> > > > After fsync(fd) we *must* have everything on disk and reachable even if
> > > > we crashed just after fsync(). So conversion of extents from uninitialized
> > > > to initialized must happen during fsync() and it does not so far because we
> > > > have no connection from an inode to the work queued to the worker thread.
> > > >
> > >
> > > Are you worried about AIO DIO case?
> > >
> > > Because for non AIO case, when DIO returns back to user, the data IO has
> > > already completed, and the conversion of extents from uninitialized to
> > > initialized has already being kicked by flush_workqueue(). Since the
> > > conversion is being journaled, the aftweward fsync() should able to
> > > commit all transactions, thus force the conversion to be complete on
> > > disk after fsync.
> > Yes, you are right, this case is fine.
> >
> > > For the AIO DIO case, yeah I agree there might be a issue.... Hmm.. if
> > > we do fsync() after AIO DIO, currently I couldn't see any way fsync()
> > > could ensure all the pending data IOs from AIO DIO path would reach to
> > > disk...
> > >
> > > if there is a way fsycn() could wait for all pending data IOs from AIO
> > > DIO to complete, then end_io callback would able to trigger the
> > > conversion, and fsycn() would able to flush the metedata update hit to
> > > disk.
> > Well, thinking about this again, fsync() does not have to wait for
> > pending AIO - we never guaranteed anything about what fsync() does to such
> > requests. But once you complete the AIO (and thus userspace can know that
> > it is done), you have to make sure that fsync() flushes all the work
> > queued in the workqueue connected with this AIO.
> >
>
> I just realize, why don't we flush the work in the workqueue with this
> AIO at the IO completion time (but not ext4_direct_IO time)? Since the
> work just does conversion but not the full journal commit. That would
> work if there is fsync() after the AIO is completed.
The problem is that IO completion for AIO happens in interrupt and you
cannot sleep there. If you could, you would not need any workqueue...
> > > > So I'd
> > > > find simpler to do the conversion from ext4_direct_IO and just make sure the
> > > > transaction is not committed before the IO is done (essentially what we
> > > > currently do with ordered buffers).
> > > >
> > >
> > > True, that might workable....I am concerned this won't work for no
> > > journal mode, though this mode is not commonly used now.:)
> > Well, if you are running without a journal, you can see stale data after
> > a crash - there's no way around it. So just not waiting for anything works
> > well.
> >
>
> But in the direct IO case, with the end_io call back, we won't update
> i_disksize until IO is completed, you should not see stale data after a
> crash, even without journal, am I miss anything?
Yes, that is probably right (unless disk reorders writes). But my point
was that once you run without a journal, we don't care whether we expose
stale data or not.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
Update since V2
1) reverse the two patches order
2) drop the code to use end_io to handle direct write to end of file too. Just
keep this patch handle fallocate and holes. Keep the orphan list for write to end of file.
3) survived fsstress test and fsx test.
I may missed some review comments from last time, but I need to send this out for review first.
Thanks,
Mingming
-----------------
ext4: Direct IO for holes and fallocate: unwritten extents spt for DIO
From: Mingming <[email protected]>
When writing into an unitialized extent via direct I/O, and the direct
I/O doesn't exactly cover the unitialized extent, split the extent
into uninitialized and initialized extents before submitting the I/O.
The reason for doing this is to avoid needing to deal with an ENOSPC
error in the end_io callback that gets used for direct I/O.
When the IO is complete, the written extent will be marked as initialized.
Singed-Off-By: Mingming Cao <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4.h | 7
fs/ext4/ext4_extents.h | 7
fs/ext4/extents.c | 421 ++++++++++++++++++++++++++++++++++++++++++++-----
fs/ext4/inode.c | 3
fs/ext4/migrate.c | 2
fs/ext4/move_extent.c | 4
6 files changed, 403 insertions(+), 41 deletions(-)
Index: linux-2.6.31-rc4/fs/ext4/ext4_extents.h
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/ext4_extents.h
+++ linux-2.6.31-rc4/fs/ext4/ext4_extents.h
@@ -219,6 +219,11 @@ static inline int ext4_ext_get_actual_le
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
+static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
+{
+ ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
+}
+
extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -234,7 +239,7 @@ extern int ext4_ext_try_to_merge(struct
struct ext4_ext_path *path,
struct ext4_extent *);
extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
ext_prepare_callback, void *);
extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
Index: linux-2.6.31-rc4/fs/ext4/extents.c
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/extents.c
+++ linux-2.6.31-rc4/fs/ext4/extents.c
@@ -706,7 +706,7 @@ err:
* insert new index [@logical;@ptr] into the block at @curp;
* check where to insert: before @curp or after @curp
*/
-static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
struct ext4_ext_path *curp,
int logical, ext4_fsblk_t ptr)
{
@@ -1569,7 +1569,7 @@ out:
*/
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ struct ext4_extent *newext, int flag)
{
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
@@ -1585,7 +1585,8 @@ int ext4_ext_insert_extent(handle_t *han
BUG_ON(path[depth].p_hdr == NULL);
/* try to insert block into found extent and return */
- if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
+ if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ && ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
@@ -1705,7 +1706,8 @@ has_space:
merge:
/* try to merge extents to the right */
- ext4_ext_try_to_merge(inode, path, nearex);
+ if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ ext4_ext_try_to_merge(inode, path, nearex);
/* try to merge extents to the left */
@@ -2473,7 +2475,6 @@ static int ext4_ext_zeroout(struct inode
}
#define EXT4_EXT_ZERO_LEN 7
-
/*
* This function is called by ext4_ext_get_blocks() if someone tries to write
* to an uninitialized extent. It may result in splitting the uninitialized
@@ -2566,7 +2567,8 @@ static int ext4_ext_convert_to_initializ
ex3->ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(ex3, newblock);
ex3->ee_len = cpu_to_le16(allocated);
- err = ext4_ext_insert_extent(handle, inode, path, ex3);
+ err = ext4_ext_insert_extent(handle, inode, path,
+ ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2622,7 +2624,7 @@ static int ext4_ext_convert_to_initializ
ext4_ext_store_pblock(ex3, newblock + max_blocks);
ex3->ee_len = cpu_to_le16(allocated - max_blocks);
ext4_ext_mark_uninitialized(ex3);
- err = ext4_ext_insert_extent(handle, inode, path, ex3);
+ err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2740,7 +2742,7 @@ static int ext4_ext_convert_to_initializ
err = ext4_ext_dirty(handle, inode, path + depth);
goto out;
insert:
- err = ext4_ext_insert_extent(handle, inode, path, &newex);
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2768,6 +2770,320 @@ fix_extent_len:
}
/*
+ * This function is called by ext4_ext_get_blocks() from
+ * ext4_get_blocks_dio_write() when DIO to write
+ * to an uninitialized extent.
+ *
+ * Writing to an uninitized extent may result in splitting the uninitialized
+ * extent into multiple /intialized unintialized extents (up to three)
+ * There are three possibilities:
+ * a> There is no split required: Entire extent should be uninitialized
+ * b> Splits in two extents: Write is happening at either end of the extent
+ * c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * One of more index blocks maybe needed if the extent tree grow after
+ * the unintialized extent split. To prevent ENOSPC occur at the IO
+ * complete, we need to split the uninitialized extent before DIO submit
+ * the IO. The uninitilized extent called at this time will be split
+ * into three uninitialized extent(at most). After IO complete, the part
+ * being filled will be convert to initialized by the end_io callback function
+ * via ext4_convert_unwritten_extents().
+ */
+static int ext4_split_unwritten_extents(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t iblock,
+ unsigned int max_blocks,
+ int flags)
+{
+ struct ext4_extent *ex, newex, orig_ex;
+ struct ext4_extent *ex1 = NULL;
+ struct ext4_extent *ex2 = NULL;
+ struct ext4_extent *ex3 = NULL;
+ struct ext4_extent_header *eh;
+ ext4_lblk_t ee_block;
+ unsigned int allocated, ee_len, depth;
+ ext4_fsblk_t newblock;
+ int err = 0;
+ int ret = 0;
+
+ ext_debug("ext4_split_unwritten_extents: inode %lu,"
+ "iblock %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)iblock, max_blocks);
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ allocated = ee_len - (iblock - ee_block);
+ newblock = iblock - ee_block + ext_pblock(ex);
+ ex2 = ex;
+ orig_ex.ee_block = ex->ee_block;
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+
+ /*
+ * if the entire unintialized extent length less than
+ * the size of extent to write, there is no need to split
+ * uninitialized extent
+ */
+ if (allocated <= max_blocks)
+ return ret;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* ex1: ee_block to iblock - 1 : uninitialized */
+ if (iblock > ee_block) {
+ ex1 = ex;
+ ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ext4_ext_mark_uninitialized(ex1);
+ ex2 = &newex;
+ }
+ /*
+ * for sanity, update the length of the ex2 extent before
+ * we insert ex3, if ex1 is NULL. This is to avoid temporary
+ * overlap of blocks.
+ */
+ if (!ex1 && allocated > max_blocks)
+ ex2->ee_len = cpu_to_le16(max_blocks);
+ /* ex3: to ee_block + ee_len : uninitialised */
+ if (allocated > max_blocks) {
+ unsigned int newdepth;
+ ex3 = &newex;
+ ex3->ee_block = cpu_to_le32(iblock + max_blocks);
+ ext4_ext_store_pblock(ex3, newblock + max_blocks);
+ ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+ ext4_ext_mark_uninitialized(ex3);
+ err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_dirty(handle, inode, path + depth);
+ /* zeroed the full extent */
+ /* blocks available from iblock */
+ return allocated;
+
+ } else if (err)
+ goto fix_extent_len;
+ /*
+ * The depth, and hence eh & ex might change
+ * as part of the insert above.
+ */
+ newdepth = ext_depth(inode);
+ /*
+ * update the extent length after successful insert of the
+ * split extent
+ */
+ orig_ex.ee_len = cpu_to_le16(ee_len -
+ ext4_ext_get_actual_len(ex3));
+ depth = newdepth;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, iblock, path);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ if (ex2 != &newex)
+ ex2 = ex;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ allocated = max_blocks;
+ }
+ /*
+ * If there was a change of depth as part of the
+ * insertion of ex3 above, we need to update the length
+ * of the ex1 extent again here
+ */
+ if (ex1 && ex1 != ex) {
+ ex1 = ex;
+ ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ext4_ext_mark_uninitialized(ex1);
+ ex2 = &newex;
+ }
+ /*
+ * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
+ * uninitialised still.
+ */
+ ex2->ee_block = cpu_to_le32(iblock);
+ ext4_ext_store_pblock(ex2, newblock);
+ ex2->ee_len = cpu_to_le16(allocated);
+ ext4_ext_mark_uninitialized(ex2);
+ if (ex2 != ex)
+ goto insert;
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ ext_debug("out here\n");
+ goto out;
+insert:
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_dirty(handle, inode, path + depth);
+ /* zero out the first half */
+ return allocated;
+ } else if (err)
+ goto fix_extent_len;
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err ? err : allocated;
+
+fix_extent_len:
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_mark_uninitialized(ex);
+ ext4_ext_dirty(handle, inode, path + depth);
+ return err;
+}
+static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent *ex;
+ struct ext4_extent_header *eh;
+ int depth;
+ int err = 0;
+ int ret = 0;
+
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* first mark the extent as initialized */
+ ext4_ext_mark_initialized(ex);
+
+ /*
+ * We have to see if it can be merged with the extent
+ * on the left.
+ */
+ if (ex > EXT_FIRST_EXTENT(eh)) {
+ /*
+ * To merge left, pass "ex - 1" to try_to_merge(),
+ * since it merges towards right _only_.
+ */
+ ret = ext4_ext_try_to_merge(inode, path, ex - 1);
+ if (ret) {
+ err = ext4_ext_correct_indexes(handle, inode, path);
+ if (err)
+ goto out;
+ depth = ext_depth(inode);
+ ex--;
+ }
+ }
+ /*
+ * Try to Merge towards right.
+ */
+ ret = ext4_ext_try_to_merge(inode, path, ex);
+ if (ret) {
+ err = ext4_ext_correct_indexes(handle, inode, path);
+ if (err)
+ goto out;
+ depth = ext_depth(inode);
+ }
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+}
+
+static int
+ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, unsigned int max_blocks,
+ struct ext4_ext_path *path, int flags,
+ unsigned int allocated, struct buffer_head *bh_result,
+ ext4_fsblk_t newblock)
+{
+ int ret = 0;
+ int err = 0;
+
+ ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
+ "block %llu, max_blocks %u, flags %d, allocated %u",
+ inode->i_ino, (unsigned long long)iblock, max_blocks,
+ flags, allocated);
+ ext4_ext_show_leaf(inode, path);
+
+ /* DIO get_block() before submit the IO, split the extent */
+ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ ret = ext4_split_unwritten_extents(handle,
+ inode, path, iblock,
+ max_blocks, flags);
+ goto out;
+ }
+ /* DIO end_io complete, convert the filled extent to written */
+ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
+ ret = ext4_convert_unwritten_extents_dio(handle, inode,
+ path);
+ goto out2;
+ }
+ /* buffered IO case */
+ /*
+ * repeat fallocate creation request
+ * we already have an unwritten extent
+ */
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+ goto map_out;
+
+ /* buffered READ or buffered write_begin() lookup */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ /*
+ * We have blocks reserved already. We
+ * return allocated blocks so that delalloc
+ * won't do block reservation for us. But
+ * the buffer head will be unmapped so that
+ * a read from the block returns 0s.
+ */
+ set_buffer_unwritten(bh_result);
+ goto out1;
+ }
+
+ /* buffered write, writepage time, convert*/
+ ret = ext4_ext_convert_to_initialized(handle, inode,
+ path, iblock,
+ max_blocks);
+out:
+ if (ret <= 0) {
+ err = ret;
+ goto out2;
+ } else
+ allocated = ret;
+ set_buffer_new(bh_result);
+map_out:
+ set_buffer_mapped(bh_result);
+out1:
+ if (allocated > max_blocks)
+ allocated = max_blocks;
+ ext4_ext_show_leaf(inode, path);
+ bh_result->b_bdev = inode->i_sb->s_bdev;
+ bh_result->b_blocknr = newblock;
+out2:
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+ return err ? err : allocated;
+}
+/*
* Block allocation/map/preallocation routine for extents based files
*
*
@@ -2872,33 +3188,10 @@ int ext4_ext_get_blocks(handle_t *handle
EXT4_EXT_CACHE_EXTENT);
goto out;
}
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
- goto out;
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- if (allocated > max_blocks)
- allocated = max_blocks;
- /*
- * We have blocks reserved already. We
- * return allocated blocks so that delalloc
- * won't do block reservation for us. But
- * the buffer head will be unmapped so that
- * a read from the block returns 0s.
- */
- set_buffer_unwritten(bh_result);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
- goto out2;
- }
-
- ret = ext4_ext_convert_to_initialized(handle, inode,
- path, iblock,
- max_blocks);
- if (ret <= 0) {
- err = ret;
- goto out2;
- } else
- allocated = ret;
- goto outnew;
+ ret = ext4_ext_handle_uninitialized_extents(handle,
+ inode, iblock, max_blocks, path,
+ flags, allocated, bh_result, newblock);
+ return ret;
}
}
@@ -2971,7 +3264,7 @@ int ext4_ext_get_blocks(handle_t *handle
newex.ee_len = cpu_to_le16(ar.len);
if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
ext4_ext_mark_uninitialized(&newex);
- err = ext4_ext_insert_extent(handle, inode, path, &newex);
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
/* free data blocks we just allocated */
/* not a good idea to call discard here directly,
@@ -2985,7 +3278,6 @@ int ext4_ext_get_blocks(handle_t *handle
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
-outnew:
set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */
@@ -3184,6 +3476,63 @@ retry:
}
/*
+ * This function convert a range of blocks to written extents
+ * The caller of this function will pass the start offset and the size.
+ * all unwritten extents within this range will be converted to
+ * written extents.
+ *
+ * This function is called from the direct IO end io call back
+ * function, to convert the fallocated extents after IO is completed.
+ */
+int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ loff_t len)
+{
+ handle_t *handle;
+ ext4_lblk_t block;
+ unsigned int max_blocks;
+ int ret = 0;
+ int ret2 = 0;
+ struct buffer_head map_bh;
+ unsigned int credits, blkbits = inode->i_blkbits;
+
+ block = offset >> blkbits;
+ /*
+ * We can't just convert len to max_blocks because
+ * If blocksize = 4096 offset = 3072 and len = 2048
+ */
+ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+ - block;
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ while (ret >= 0 && ret < max_blocks) {
+ block = block + ret;
+ max_blocks = max_blocks - ret;
+ handle = ext4_journal_start(inode, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
+ map_bh.b_state = 0;
+ ret = ext4_get_blocks(handle, inode, block,
+ max_blocks, &map_bh,
+ EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+ if (ret <= 0) {
+ WARN_ON(ret <= 0);
+ printk(KERN_ERR "%s: ext4_ext_get_blocks "
+ "returned error inode#%lu, block=%u, "
+ "max_blocks=%u", __func__,
+ inode->i_ino, block, max_blocks);
+ }
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ if (ret <= 0 || ret2 )
+ break;
+ }
+ return ret > 0 ? ret2 : ret;
+}
+/*
* Callback function called for each extent to gather FIEMAP information.
*/
static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
Index: linux-2.6.31-rc4/fs/ext4/inode.c
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/inode.c
+++ linux-2.6.31-rc4/fs/ext4/inode.c
@@ -1155,6 +1155,9 @@ int ext4_get_blocks(handle_t *handle, st
clear_buffer_mapped(bh);
clear_buffer_unwritten(bh);
+ ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, flags, max_blocks,
+ (unsigned long)block);
/*
* Try to see if we can get the block without requesting a new
* file system block.
Index: linux-2.6.31-rc4/fs/ext4/migrate.c
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/migrate.c
+++ linux-2.6.31-rc4/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle
goto err_out;
}
}
- retval = ext4_ext_insert_extent(handle, inode, path, &newext);
+ retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
err_out:
if (path) {
ext4_ext_drop_refs(path);
Index: linux-2.6.31-rc4/fs/ext4/move_extent.c
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/move_extent.c
+++ linux-2.6.31-rc4/fs/ext4/move_extent.c
@@ -288,7 +288,7 @@ mext_insert_across_blocks(handle_t *hand
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
- orig_path, new_ext))
+ orig_path, new_ext, 0))
goto out;
}
@@ -299,7 +299,7 @@ mext_insert_across_blocks(handle_t *hand
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
- orig_path, end_ext))
+ orig_path, end_ext, 0))
goto out;
}
out:
Index: linux-2.6.31-rc4/fs/ext4/ext4.h
===================================================================
--- linux-2.6.31-rc4.orig/fs/ext4/ext4.h
+++ linux-2.6.31-rc4/fs/ext4/ext4.h
@@ -111,6 +111,15 @@ struct ext4_allocation_request {
unsigned int flags;
};
+typedef struct ext4_io_end{
+ struct inode *inode; /* file being written to */
+ unsigned int flag; /* sync IO or AIO */
+ int error; /* I/O error code */
+ ext4_lblk_t offset; /* offset in the file */
+ size_t size; /* size of the extent */
+ struct work_struct work; /* data work queue */
+}ext4_io_end_t;
+
/*
* Special inodes numbers
*/
@@ -330,7 +339,16 @@ struct ext4_new_group_data {
/* Call ext4_da_update_reserve_space() after successfully
allocating the blocks */
#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008