Hi list,
Currently, in ext4, write dio is serialized because i_mutex is locked in
generic_file_aio_write. But, when we overwrite some data without changing
metadata, these dios can be parallelized. So this patch set aims to make
overwrite dio paralleled.
When we overwrite some data, the metadata of this file doesn't need to be
modified. Thus, we can try to lock i_data_sem directly to synchronized all
dio write operations. First of all, a new wrapper function is defined instead
of genereic_file_aio_write in order to avoid to lock i_mutex. Then we need to
define a new get_block function and a new flag for dio overwrite nolock feature
so that we can avoid nested lock and deadlock. In ext4_map_blocks, i_data_sem
is acquired to do a lookup. But after adding this new feature, this lock will
be acquired in high level. Obviouslyi, here is a nested lock and we need to
avoid it. Now, in ext4, we always start a new journal firstly, and then try to
acquire i_data_sem. When we do a overwrite dio, journal doesn't need to be
created in order to avoid a deadlock.
In new wrapper function, called ext4_file_dio_write, it checks whether
conditions are satisfied or not. If these are met, we lock i_data_sem directly
and parallelize all write operations.
In first patch, two functions are defined in order to split into buffered IO and
direct IO because we can keep buffered IO that still uses vfs path, and add new
feature into dio path.
In second patch, we add a new flag and a new function for get_block. This
get_block function only does a lookup without any locks.
In last patch, dio overwrite nolock is added. This feature also need to use
dioread_nolock option. When a filesystem is mounted with dioread_nolock, this
feature is enabled.
I have run some benchmarks in my desktop to test this feature. In my desktop,
it has a Intel(R) Core(TM)2 Duo CPU E8400 @ 3.00GHz, 4G memory and a Intel X-25
160G SSD. I use fio to run my benchmarks and I compare dio overwrite nolock
with w/o dioread_nolock and w/ dioread_nolock.
= case 1 =
== config file ==
[global]
ioengine=psync
direct=1
bs=4k
size=32G
runtime=60
directory=/mnt/ext4/
filename=testfile
group_reporting
thread
[file1]
numjobs=1 # 4 8 16
rw=randwrite
== result (iops) ==
write 1 4 8 16
lock 7233 8612 9102 9165
dioread_nolock 8217 8228 8673 8755
diooverwrite_nolock 7740 15446 14563 17749
= case 2 =
== config file ==
[global]
ioengine=sync
direct=1
bs=4k
size=32G
runtime=60
directory=/mnt/ext4/
filename=testfile
group_reporting
thread
[file1]
numjobs=1 # 2 4 8
rw=randread
[file2]
numjobs=1 # 2 4 8
rw=randwrite
== result (iops) ==
read/write 2 4 8 16
lock 614/4343 1346/3124 1271/3930 1386/3904
dioread_nolock 1040/1963 2162/1243 3980/1479 13716/924
diooverwrite_nolock 1006/1913 1973/2602 3683/4515 6966/7260
Regards,
Zheng
Zheng Liu (3):
ext4: split ext4_file_write into buffered IO and direct IO
ext4: add a new flag for ext4_map_blocks
ext4: add dio overwrite nolock
fs/ext4/ext4.h | 2 +
fs/ext4/file.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++------
fs/ext4/inode.c | 46 ++++++++++---
3 files changed, 215 insertions(+), 33 deletions(-)
From: Zheng Liu <[email protected]>
ext4_file_buffered/direct_write are defined in order to split buffered IO and
direct IO in ext4. This patch just refactor some stuff in write path.
Signed-off-by: Zheng Liu <[email protected]>
---
fs/ext4/file.c | 66 +++++++++++++++++++++++++++++++++++++------------------
1 files changed, 44 insertions(+), 22 deletions(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index cb70f18..e5d6be3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -89,12 +89,51 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
return 0;
}
+static inline ssize_t
+ext4_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ return generic_file_aio_write(iocb, iov, nr_segs, pos);
+}
+
+static ssize_t
+ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ int unaligned_aio = 0;
+ ssize_t ret;
+
+ if (!is_sync_kiocb(iocb))
+ unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
+
+ /* Unaligned direct AIO must be serialized; see comment above */
+ if (unaligned_aio) {
+ static unsigned long unaligned_warn_time;
+
+ /* Warn about this once per day */
+ if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
+ ext4_msg(inode->i_sb, KERN_WARNING,
+ "Unaligned AIO/DIO on inode %ld by %s; "
+ "performance will be poor.",
+ inode->i_ino, current->comm);
+ mutex_lock(ext4_aio_mutex(inode));
+ ext4_aiodio_wait(inode);
+ }
+
+ ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+ if (unaligned_aio)
+ mutex_unlock(ext4_aio_mutex(inode));
+
+ return ret;
+}
+
static ssize_t
ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
- int unaligned_aio = 0;
int ret;
/*
@@ -114,29 +153,12 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
sbi->s_bitmap_maxbytes - pos);
}
- } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
- !is_sync_kiocb(iocb))) {
- unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
- }
-
- /* Unaligned direct AIO must be serialized; see comment above */
- if (unaligned_aio) {
- static unsigned long unaligned_warn_time;
-
- /* Warn about this once per day */
- if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
- ext4_msg(inode->i_sb, KERN_WARNING,
- "Unaligned AIO/DIO on inode %ld by %s; "
- "performance will be poor.",
- inode->i_ino, current->comm);
- mutex_lock(ext4_aio_mutex(inode));
- ext4_aiodio_wait(inode);
}
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
From: Zheng Liu <[email protected]>
Aligned and overwrite direct IO can be parallelized. In ext4_file_dio_write,
we first check whether these conditions are satisfied or not. If so, we unlock
the i_mutex and acquire i_data_sem directly. Meanwhile iocb->private is set to
indicate that this is a overwrite dio, and it will be processed in
ext4_ext_direct_IO.
Signed-off-by: Zheng Liu <[email protected]>
---
fs/ext4/file.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 137 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index e5d6be3..8a5f713 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -100,9 +100,21 @@ static ssize_t
ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
- int unaligned_aio = 0;
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct blk_plug plug;
ssize_t ret;
+ ssize_t written, written_buffered;
+ size_t length = iov_length(iov, nr_segs);
+ size_t ocount; /* original count */
+ size_t count; /* after file limit checks */
+ int unaligned_aio = 0;
+ int overwrite = 0;
+ loff_t *ppos = &iocb->ki_pos;
+ loff_t endbyte;
+
+ BUG_ON(iocb->ki_pos != pos);
if (!is_sync_kiocb(iocb))
unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
@@ -121,7 +133,129 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
ext4_aiodio_wait(inode);
}
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ mutex_lock(&inode->i_mutex);
+ blk_start_plug(&plug);
+
+ ocount = 0;
+ ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+ if (ret)
+ goto unlock_out;
+
+ count = ocount;
+ pos = *ppos;
+
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+ written = 0;
+
+ ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (ret)
+ goto out;
+
+ if (count == 0)
+ goto out;
+
+ ret = file_remove_suid(file);
+ if (ret)
+ goto out;
+
+ file_update_time(file);
+
+ iocb->private = NULL;
+
+ if (!unaligned_aio && !file->f_mapping->nrpages &&
+ pos + length < i_size_read(inode) &&
+ ext4_should_dioread_nolock(inode)) {
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ int err;
+ int len;
+
+ map.m_lblk = pos >> blkbits;
+ map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
+ - map.m_lblk;
+ len = map.m_len;
+
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err == len && (!map.m_flags ||
+ map.m_flags & EXT4_MAP_MAPPED)) {
+ overwrite = 1;
+ iocb->private = &overwrite;
+ mutex_unlock(&inode->i_mutex);
+ down_read(&EXT4_I(inode)->i_data_sem);
+ }
+ }
+
+ if (file->f_mapping->nrpages && overwrite) {
+ overwrite = 0;
+ up_read(&EXT4_I(inode)->i_data_sem);
+ mutex_lock(&inode->i_mutex);
+ }
+
+ written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
+ ppos, count, ocount);
+ if (written < 0 || written == count)
+ goto out;
+ /*
+ * direct-io write to a hole: fall through to buffered I/O
+ * for completing the rest of the request.
+ */
+ pos += written;
+ count -= written;
+ written_buffered = generic_file_buffered_write(iocb, iov,
+ nr_segs, pos, ppos, count,
+ written);
+ /*
+ * If generic_file_buffered_write() retuned a synchronous error
+ * then we want to return the number of bytes which were
+ * direct-written, or the error code if that was zero. Note
+ * that this differs from normal direct-io semantics, which
+ * will return -EFOO even if some bytes were written.
+ */
+ if (written_buffered < 0) {
+ ret = written_buffered;
+ goto out;
+ }
+
+ /*
+ * We need to ensure that the page cache pages are written to
+ * disk and invalidated to preserve the expected O_DIRECT
+ * semantics.
+ */
+ endbyte = pos + written_buffered - written - 1;
+ ret = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+ if (ret == 0) {
+ written = written_buffered;
+ invalidate_mapping_pages(mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+ } else {
+ /*
+ * We don't know how much we wrote, so just return
+ * the number of bytes which were direct-written
+ */
+ }
+
+out:
+ current->backing_dev_info = NULL;
+ ret = written ? written : ret;
+
+unlock_out:
+ if (overwrite)
+ up_read(&EXT4_I(inode)->i_data_sem);
+ else
+ mutex_unlock(&inode->i_mutex);
+
+ if (ret > 0 || ret == -EIOCBQUEUED) {
+ ssize_t err;
+
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
+ blk_finish_plug(&plug);
if (unaligned_aio)
mutex_unlock(ext4_aio_mutex(inode));
--
1.7.1
From: Zheng Liu <[email protected]>
EXT4_GET_BLOCKS_NO_LOCK flag is added to indicate that we don't need to acquire
i_data_sem lock in ext4_map_blocks. Meanwhile, it lets _ext4_get_block do not
start a new journal because when we do a overwrite dio, there is no any
metadata that needs to be modified.
We define a new function called ext4_get_block_write_nolock, which is used in
dio overwrite nolock. In this function, it doesn't try to acquire i_data_sem
lock and doesn't start a new journal as it does a lookup.
Signed-off-by: Zheng Liu <[email protected]>
---
fs/ext4/ext4.h | 2 ++
fs/ext4/inode.c | 46 +++++++++++++++++++++++++++++++++++-----------
2 files changed, 37 insertions(+), 11 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0e01e90..748fda5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -558,6 +558,8 @@ enum {
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
/* Request will not result in inode size update (user for fallocate) */
#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
+ /* Do not acquire i_data_sem lock */
+#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
/*
* Flags used by ext4_free_blocks
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c77b0bd..c762d4f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -477,7 +477,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* Try to see if we can get the block without requesting a new
* file system block.
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read((&EXT4_I(inode)->i_data_sem));
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -485,7 +486,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
retval = ext4_ind_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
}
- up_read((&EXT4_I(inode)->i_data_sem));
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
int ret = check_block_validity(inode, map);
@@ -597,7 +599,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- if (flags && !handle) {
+ if (flags && !handle && !(flags & EXT4_GET_BLOCKS_NO_LOCK)) {
/* Direct IO write... */
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
@@ -2751,6 +2753,15 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_IO_CREATE_EXT);
}
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_NO_LOCK);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private, int ret,
bool is_async)
@@ -2899,6 +2910,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
loff_t final_size = offset + count;
if (rw == WRITE && final_size <= inode->i_size) {
+ int overwrite = 0;
+
/*
* We could direct write to holes and fallocate.
*
@@ -2919,6 +2932,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* Here for async case, we allocate an io_end structure to
* hook to the iocb.
*/
+ if (iocb->private)
+ overwrite = *((int *)iocb->private);
iocb->private = NULL;
EXT4_I(inode)->cur_aio_dio = NULL;
if (!is_sync_kiocb(iocb)) {
@@ -2938,13 +2953,22 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
EXT4_I(inode)->cur_aio_dio = iocb->private;
}
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block_write,
- ext4_end_io_dio,
- NULL,
- DIO_LOCKING);
+ if (overwrite)
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_write_nolock,
+ ext4_end_io_dio,
+ NULL,
+ 0);
+ else
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_write,
+ ext4_end_io_dio,
+ NULL,
+ DIO_LOCKING);
if (iocb->private)
EXT4_I(inode)->cur_aio_dio = NULL;
/*
@@ -2964,7 +2988,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
ext4_free_io_end(iocb->private);
iocb->private = NULL;
- } else if (ret > 0 && ext4_test_inode_state(inode,
+ } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
int err;
/*
--
1.7.1
Hi zheng,
On 04/28/2012 11:39 AM, Zheng Liu wrote:
> From: Zheng Liu <[email protected]>
>
> ext4_file_buffered/direct_write are defined in order to split buffered IO and
> direct IO in ext4. This patch just refactor some stuff in write path.
>
> Signed-off-by: Zheng Liu <[email protected]>
> ---
> fs/ext4/file.c | 66 +++++++++++++++++++++++++++++++++++++------------------
> 1 files changed, 44 insertions(+), 22 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index cb70f18..e5d6be3 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -89,12 +89,51 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
> return 0;
> }
>
> +static inline ssize_t
> +ext4_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
> + unsigned long nr_segs, loff_t pos)
> +{
> + return generic_file_aio_write(iocb, iov, nr_segs, pos);
> +}
any reason you wrap generic_file_aio_write with a new function? I didn't
see you use it in the following patch either.
Thanks
Tao
> +
> +static ssize_t
> +ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> + unsigned long nr_segs, loff_t pos)
> +{
> + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> + int unaligned_aio = 0;
> + ssize_t ret;
> +
> + if (!is_sync_kiocb(iocb))
> + unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> +
> + /* Unaligned direct AIO must be serialized; see comment above */
> + if (unaligned_aio) {
> + static unsigned long unaligned_warn_time;
> +
> + /* Warn about this once per day */
> + if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
> + ext4_msg(inode->i_sb, KERN_WARNING,
> + "Unaligned AIO/DIO on inode %ld by %s; "
> + "performance will be poor.",
> + inode->i_ino, current->comm);
> + mutex_lock(ext4_aio_mutex(inode));
> + ext4_aiodio_wait(inode);
> + }
> +
> + ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> +
> + if (unaligned_aio)
> + mutex_unlock(ext4_aio_mutex(inode));
> +
> + return ret;
> +}
> +
> static ssize_t
> ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
> unsigned long nr_segs, loff_t pos)
> {
> struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> - int unaligned_aio = 0;
> int ret;
>
> /*
> @@ -114,29 +153,12 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
> nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
> sbi->s_bitmap_maxbytes - pos);
> }
> - } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
> - !is_sync_kiocb(iocb))) {
> - unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> - }
> -
> - /* Unaligned direct AIO must be serialized; see comment above */
> - if (unaligned_aio) {
> - static unsigned long unaligned_warn_time;
> -
> - /* Warn about this once per day */
> - if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
> - ext4_msg(inode->i_sb, KERN_WARNING,
> - "Unaligned AIO/DIO on inode %ld by %s; "
> - "performance will be poor.",
> - inode->i_ino, current->comm);
> - mutex_lock(ext4_aio_mutex(inode));
> - ext4_aiodio_wait(inode);
> }
>
> - ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> -
> - if (unaligned_aio)
> - mutex_unlock(ext4_aio_mutex(inode));
> + if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
> + ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
> + else
> + ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
>
> return ret;
> }
On Wed, May 02, 2012 at 12:11:44PM +0800, Tao Ma wrote:
> Hi zheng,
> On 04/28/2012 11:39 AM, Zheng Liu wrote:
> > From: Zheng Liu <[email protected]>
> >
> > ext4_file_buffered/direct_write are defined in order to split buffered IO and
> > direct IO in ext4. This patch just refactor some stuff in write path.
> >
> > Signed-off-by: Zheng Liu <[email protected]>
> > ---
> > fs/ext4/file.c | 66 +++++++++++++++++++++++++++++++++++++------------------
> > 1 files changed, 44 insertions(+), 22 deletions(-)
> >
> > diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> > index cb70f18..e5d6be3 100644
> > --- a/fs/ext4/file.c
> > +++ b/fs/ext4/file.c
> > @@ -89,12 +89,51 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
> > return 0;
> > }
> >
> > +static inline ssize_t
> > +ext4_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
> > + unsigned long nr_segs, loff_t pos)
> > +{
> > + return generic_file_aio_write(iocb, iov, nr_segs, pos);
> > +}
> any reason you wrap generic_file_aio_write with a new function? I didn't
> see you use it in the following patch either.
Yes, I don't use it in the following patch. It is defined in order to
be consistent with ext4_file_dio_write and make things clearly.
Regards,
Zheng
>
> Thanks
> Tao
> > +
> > +static ssize_t
> > +ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> > + unsigned long nr_segs, loff_t pos)
> > +{
> > + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> > + int unaligned_aio = 0;
> > + ssize_t ret;
> > +
> > + if (!is_sync_kiocb(iocb))
> > + unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> > +
> > + /* Unaligned direct AIO must be serialized; see comment above */
> > + if (unaligned_aio) {
> > + static unsigned long unaligned_warn_time;
> > +
> > + /* Warn about this once per day */
> > + if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
> > + ext4_msg(inode->i_sb, KERN_WARNING,
> > + "Unaligned AIO/DIO on inode %ld by %s; "
> > + "performance will be poor.",
> > + inode->i_ino, current->comm);
> > + mutex_lock(ext4_aio_mutex(inode));
> > + ext4_aiodio_wait(inode);
> > + }
> > +
> > + ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> > +
> > + if (unaligned_aio)
> > + mutex_unlock(ext4_aio_mutex(inode));
> > +
> > + return ret;
> > +}
> > +
> > static ssize_t
> > ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
> > unsigned long nr_segs, loff_t pos)
> > {
> > struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> > - int unaligned_aio = 0;
> > int ret;
> >
> > /*
> > @@ -114,29 +153,12 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
> > nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
> > sbi->s_bitmap_maxbytes - pos);
> > }
> > - } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
> > - !is_sync_kiocb(iocb))) {
> > - unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> > - }
> > -
> > - /* Unaligned direct AIO must be serialized; see comment above */
> > - if (unaligned_aio) {
> > - static unsigned long unaligned_warn_time;
> > -
> > - /* Warn about this once per day */
> > - if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
> > - ext4_msg(inode->i_sb, KERN_WARNING,
> > - "Unaligned AIO/DIO on inode %ld by %s; "
> > - "performance will be poor.",
> > - inode->i_ino, current->comm);
> > - mutex_lock(ext4_aio_mutex(inode));
> > - ext4_aiodio_wait(inode);
> > }
> >
> > - ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> > -
> > - if (unaligned_aio)
> > - mutex_unlock(ext4_aio_mutex(inode));
> > + if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
> > + ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
> > + else
> > + ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> >
> > return ret;
> > }
>
On 04/28/2012 11:39 AM, Zheng Liu wrote:
> From: Zheng Liu <[email protected]>
>
> Aligned and overwrite direct IO can be parallelized. In ext4_file_dio_write,
> we first check whether these conditions are satisfied or not. If so, we unlock
> the i_mutex and acquire i_data_sem directly. Meanwhile iocb->private is set to
> indicate that this is a overwrite dio, and it will be processed in
> ext4_ext_direct_IO.
>
> Signed-off-by: Zheng Liu <[email protected]>
> ---
> fs/ext4/file.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 files changed, 137 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index e5d6be3..8a5f713 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -100,9 +100,21 @@ static ssize_t
> ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> unsigned long nr_segs, loff_t pos)
> {
> - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> - int unaligned_aio = 0;
> + struct file *file = iocb->ki_filp;
> + struct address_space * mapping = file->f_mapping;
> + struct inode *inode = file->f_path.dentry->d_inode;
> + struct blk_plug plug;
> ssize_t ret;
> + ssize_t written, written_buffered;
> + size_t length = iov_length(iov, nr_segs);
> + size_t ocount; /* original count */
> + size_t count; /* after file limit checks */
> + int unaligned_aio = 0;
> + int overwrite = 0;
> + loff_t *ppos = &iocb->ki_pos;
> + loff_t endbyte;
> +
> + BUG_ON(iocb->ki_pos != pos);
>
> if (!is_sync_kiocb(iocb))
> unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> @@ -121,7 +133,129 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> ext4_aiodio_wait(inode);
> }
>
> - ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> + mutex_lock(&inode->i_mutex);
> + blk_start_plug(&plug);
> +
> + ocount = 0;
> + ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
> + if (ret)
> + goto unlock_out;
> +
> + count = ocount;
> + pos = *ppos;
> +
> + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
> +
> + /* We can write back this queue in page reclaim */
> + current->backing_dev_info = mapping->backing_dev_info;
> + written = 0;
> +
> + ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
> + if (ret)
> + goto out;
> +
> + if (count == 0)
> + goto out;
> +
> + ret = file_remove_suid(file);
> + if (ret)
> + goto out;
> +
> + file_update_time(file);
> +
> + iocb->private = NULL;
> +
> + if (!unaligned_aio && !file->f_mapping->nrpages &&
> + pos + length < i_size_read(inode) &&
should be pos + length <= ?
And inode->i_size should be ok since now we have i_mutex held.
> + ext4_should_dioread_nolock(inode)) {
> + struct ext4_map_blocks map;
> + unsigned int blkbits = inode->i_blkbits;
> + int err;
> + int len;
> +
> + map.m_lblk = pos >> blkbits;
> + map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
> + - map.m_lblk;
> + len = map.m_len;
> +
> + err = ext4_map_blocks(NULL, inode, &map, 0);
> + if (err == len && (!map.m_flags ||
> + map.m_flags & EXT4_MAP_MAPPED)) {
could you please add some comments about how and why map.m_flags are
checked this way?
> + overwrite = 1;
> + iocb->private = &overwrite;
> + mutex_unlock(&inode->i_mutex);
> + down_read(&EXT4_I(inode)->i_data_sem);
Is there any possibility that the metadata is changed after we dropped
the i_mutex before the down_read?
> + }
> + }
> +
> + if (file->f_mapping->nrpages && overwrite) {
> + overwrite = 0;
> + up_read(&EXT4_I(inode)->i_data_sem);
> + mutex_lock(&inode->i_mutex);
I am not sure whether it could happen. But if it does happen, should we
also change the value in iocb->private?
> + }
> +
> + written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
> + ppos, count, ocount);
> + if (written < 0 || written == count)
> + goto out;
> + /*
> + * direct-io write to a hole: fall through to buffered I/O
> + * for completing the rest of the request.
> + */
> + pos += written;
> + count -= written;
> + written_buffered = generic_file_buffered_write(iocb, iov,
> + nr_segs, pos, ppos, count,
> + written);
If we fall back here, should we re-lock the i_mutex since the buffer
write isn't guaranteed?
Thanks
Tao
> + /*
> + * If generic_file_buffered_write() retuned a synchronous error
> + * then we want to return the number of bytes which were
> + * direct-written, or the error code if that was zero. Note
> + * that this differs from normal direct-io semantics, which
> + * will return -EFOO even if some bytes were written.
> + */
> + if (written_buffered < 0) {
> + ret = written_buffered;
> + goto out;
> + }
> +
> + /*
> + * We need to ensure that the page cache pages are written to
> + * disk and invalidated to preserve the expected O_DIRECT
> + * semantics.
> + */
> + endbyte = pos + written_buffered - written - 1;
> + ret = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
> + if (ret == 0) {
> + written = written_buffered;
> + invalidate_mapping_pages(mapping,
> + pos >> PAGE_CACHE_SHIFT,
> + endbyte >> PAGE_CACHE_SHIFT);
> + } else {
> + /*
> + * We don't know how much we wrote, so just return
> + * the number of bytes which were direct-written
> + */
> + }
> +
> +out:
> + current->backing_dev_info = NULL;
> + ret = written ? written : ret;
> +
> +unlock_out:
> + if (overwrite)
> + up_read(&EXT4_I(inode)->i_data_sem);
> + else
> + mutex_unlock(&inode->i_mutex);
> +
> + if (ret > 0 || ret == -EIOCBQUEUED) {
> + ssize_t err;
> +
> + err = generic_write_sync(file, pos, ret);
> + if (err < 0 && ret > 0)
> + ret = err;
> + }
> + blk_finish_plug(&plug);
>
> if (unaligned_aio)
> mutex_unlock(ext4_aio_mutex(inode));
On Wed, May 02, 2012 at 02:59:34PM +0800, Tao Ma wrote:
> On 04/28/2012 11:39 AM, Zheng Liu wrote:
> > From: Zheng Liu <[email protected]>
> >
> > Aligned and overwrite direct IO can be parallelized. In ext4_file_dio_write,
> > we first check whether these conditions are satisfied or not. If so, we unlock
> > the i_mutex and acquire i_data_sem directly. Meanwhile iocb->private is set to
> > indicate that this is a overwrite dio, and it will be processed in
> > ext4_ext_direct_IO.
> >
> > Signed-off-by: Zheng Liu <[email protected]>
> > ---
> > fs/ext4/file.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > 1 files changed, 137 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> > index e5d6be3..8a5f713 100644
> > --- a/fs/ext4/file.c
> > +++ b/fs/ext4/file.c
> > @@ -100,9 +100,21 @@ static ssize_t
> > ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> > unsigned long nr_segs, loff_t pos)
> > {
> > - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> > - int unaligned_aio = 0;
> > + struct file *file = iocb->ki_filp;
> > + struct address_space * mapping = file->f_mapping;
> > + struct inode *inode = file->f_path.dentry->d_inode;
> > + struct blk_plug plug;
> > ssize_t ret;
> > + ssize_t written, written_buffered;
> > + size_t length = iov_length(iov, nr_segs);
> > + size_t ocount; /* original count */
> > + size_t count; /* after file limit checks */
> > + int unaligned_aio = 0;
> > + int overwrite = 0;
> > + loff_t *ppos = &iocb->ki_pos;
> > + loff_t endbyte;
> > +
> > + BUG_ON(iocb->ki_pos != pos);
> >
> > if (!is_sync_kiocb(iocb))
> > unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> > @@ -121,7 +133,129 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> > ext4_aiodio_wait(inode);
> > }
> >
> > - ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> > + mutex_lock(&inode->i_mutex);
> > + blk_start_plug(&plug);
> > +
> > + ocount = 0;
> > + ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
> > + if (ret)
> > + goto unlock_out;
> > +
> > + count = ocount;
> > + pos = *ppos;
> > +
> > + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
> > +
> > + /* We can write back this queue in page reclaim */
> > + current->backing_dev_info = mapping->backing_dev_info;
> > + written = 0;
> > +
> > + ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
> > + if (ret)
> > + goto out;
> > +
> > + if (count == 0)
> > + goto out;
> > +
> > + ret = file_remove_suid(file);
> > + if (ret)
> > + goto out;
> > +
> > + file_update_time(file);
> > +
> > + iocb->private = NULL;
> > +
> > + if (!unaligned_aio && !file->f_mapping->nrpages &&
> > + pos + length < i_size_read(inode) &&
> should be pos + length <= ?
> And inode->i_size should be ok since now we have i_mutex held.
Yes, you are right.
> > + ext4_should_dioread_nolock(inode)) {
> > + struct ext4_map_blocks map;
> > + unsigned int blkbits = inode->i_blkbits;
> > + int err;
> > + int len;
> > +
> > + map.m_lblk = pos >> blkbits;
> > + map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
> > + - map.m_lblk;
> > + len = map.m_len;
> > +
> > + err = ext4_map_blocks(NULL, inode, &map, 0);
> > + if (err == len && (!map.m_flags ||
> > + map.m_flags & EXT4_MAP_MAPPED)) {
> could you please add some comments about how and why map.m_flags are
> checked this way?
OK. I will add some comments to describe it in here.
> > + overwrite = 1;
> > + iocb->private = &overwrite;
> > + mutex_unlock(&inode->i_mutex);
> > + down_read(&EXT4_I(inode)->i_data_sem);
> Is there any possibility that the metadata is changed after we dropped
> the i_mutex before the down_read?
Yes, the metadata is possible to be changed after we unlocked i_mutex
before acquire i_data_sem. So I will swap the locking order.
> > + }
> > + }
> > +
> > + if (file->f_mapping->nrpages && overwrite) {
> > + overwrite = 0;
> > + up_read(&EXT4_I(inode)->i_data_sem);
> > + mutex_lock(&inode->i_mutex);
> I am not sure whether it could happen. But if it does happen, should we
> also change the value in iocb->private?
As I said above, if we swap the locking order, I think that it shouldn't
happen. Certainly, I will set 'iocb->private = NULL' to fix it to
ensure that when it does happen, we can make filesystem do right things.
> > + }
> > +
> > + written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
> > + ppos, count, ocount);
> > + if (written < 0 || written == count)
> > + goto out;
> > + /*
> > + * direct-io write to a hole: fall through to buffered I/O
> > + * for completing the rest of the request.
> > + */
> > + pos += written;
> > + count -= written;
> > + written_buffered = generic_file_buffered_write(iocb, iov,
> > + nr_segs, pos, ppos, count,
> > + written);
> If we fall back here, should we re-lock the i_mutex since the buffer
> write isn't guaranteed?
No, we don't need to re-lock i_mutex because dio never falls through to
buffered IO when it is an overwrite. We do a lookup using
ext4_map_blocks to ensure that it never occurs before we actually issue
a dio. I will add a BUG_ON to guarantee that it couldn't happen.
Regards,
Zheng
>
> Thanks
> Tao
> > + /*
> > + * If generic_file_buffered_write() retuned a synchronous error
> > + * then we want to return the number of bytes which were
> > + * direct-written, or the error code if that was zero. Note
> > + * that this differs from normal direct-io semantics, which
> > + * will return -EFOO even if some bytes were written.
> > + */
> > + if (written_buffered < 0) {
> > + ret = written_buffered;
> > + goto out;
> > + }
> > +
> > + /*
> > + * We need to ensure that the page cache pages are written to
> > + * disk and invalidated to preserve the expected O_DIRECT
> > + * semantics.
> > + */
> > + endbyte = pos + written_buffered - written - 1;
> > + ret = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
> > + if (ret == 0) {
> > + written = written_buffered;
> > + invalidate_mapping_pages(mapping,
> > + pos >> PAGE_CACHE_SHIFT,
> > + endbyte >> PAGE_CACHE_SHIFT);
> > + } else {
> > + /*
> > + * We don't know how much we wrote, so just return
> > + * the number of bytes which were direct-written
> > + */
> > + }
> > +
> > +out:
> > + current->backing_dev_info = NULL;
> > + ret = written ? written : ret;
> > +
> > +unlock_out:
> > + if (overwrite)
> > + up_read(&EXT4_I(inode)->i_data_sem);
> > + else
> > + mutex_unlock(&inode->i_mutex);
> > +
> > + if (ret > 0 || ret == -EIOCBQUEUED) {
> > + ssize_t err;
> > +
> > + err = generic_write_sync(file, pos, ret);
> > + if (err < 0 && ret > 0)
> > + ret = err;
> > + }
> > + blk_finish_plug(&plug);
> >
> > if (unaligned_aio)
> > mutex_unlock(ext4_aio_mutex(inode));
>
On 4/27/12 10:39 PM, Zheng Liu wrote:
> From: Zheng Liu <[email protected]>
>
> Aligned and overwrite direct IO can be parallelized. In ext4_file_dio_write,
> we first check whether these conditions are satisfied or not. If so, we unlock
> the i_mutex and acquire i_data_sem directly. Meanwhile iocb->private is set to
> indicate that this is a overwrite dio, and it will be processed in
> ext4_ext_direct_IO.
This copies almost 100 lines of generic_file_aio_write() back into
ext4. Do we really need to do this? Copying core code into the
fs can be a maintenance nightmare...
I'll have to think more about the big picture and whether or not it's
possible, but my first reaction is to find a way to leverage or modify
existing IO code rather than pasting it all into ext4 with changes...
-Eric
> Signed-off-by: Zheng Liu <[email protected]>
> ---
> fs/ext4/file.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 files changed, 137 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index e5d6be3..8a5f713 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -100,9 +100,21 @@ static ssize_t
> ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> unsigned long nr_segs, loff_t pos)
> {
> - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> - int unaligned_aio = 0;
> + struct file *file = iocb->ki_filp;
> + struct address_space * mapping = file->f_mapping;
> + struct inode *inode = file->f_path.dentry->d_inode;
> + struct blk_plug plug;
> ssize_t ret;
> + ssize_t written, written_buffered;
> + size_t length = iov_length(iov, nr_segs);
> + size_t ocount; /* original count */
> + size_t count; /* after file limit checks */
> + int unaligned_aio = 0;
> + int overwrite = 0;
> + loff_t *ppos = &iocb->ki_pos;
> + loff_t endbyte;
> +
> + BUG_ON(iocb->ki_pos != pos);
>
> if (!is_sync_kiocb(iocb))
> unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> @@ -121,7 +133,129 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> ext4_aiodio_wait(inode);
> }
>
> - ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> + mutex_lock(&inode->i_mutex);
> + blk_start_plug(&plug);
> +
> + ocount = 0;
> + ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
> + if (ret)
> + goto unlock_out;
> +
> + count = ocount;
> + pos = *ppos;
> +
> + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
> +
> + /* We can write back this queue in page reclaim */
> + current->backing_dev_info = mapping->backing_dev_info;
> + written = 0;
> +
> + ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
> + if (ret)
> + goto out;
> +
> + if (count == 0)
> + goto out;
> +
> + ret = file_remove_suid(file);
> + if (ret)
> + goto out;
> +
> + file_update_time(file);
> +
> + iocb->private = NULL;
> +
> + if (!unaligned_aio && !file->f_mapping->nrpages &&
> + pos + length < i_size_read(inode) &&
> + ext4_should_dioread_nolock(inode)) {
> + struct ext4_map_blocks map;
> + unsigned int blkbits = inode->i_blkbits;
> + int err;
> + int len;
> +
> + map.m_lblk = pos >> blkbits;
> + map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
> + - map.m_lblk;
> + len = map.m_len;
> +
> + err = ext4_map_blocks(NULL, inode, &map, 0);
> + if (err == len && (!map.m_flags ||
> + map.m_flags & EXT4_MAP_MAPPED)) {
> + overwrite = 1;
> + iocb->private = &overwrite;
> + mutex_unlock(&inode->i_mutex);
> + down_read(&EXT4_I(inode)->i_data_sem);
> + }
> + }
> +
> + if (file->f_mapping->nrpages && overwrite) {
> + overwrite = 0;
> + up_read(&EXT4_I(inode)->i_data_sem);
> + mutex_lock(&inode->i_mutex);
> + }
> +
> + written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
> + ppos, count, ocount);
> + if (written < 0 || written == count)
> + goto out;
> + /*
> + * direct-io write to a hole: fall through to buffered I/O
> + * for completing the rest of the request.
> + */
> + pos += written;
> + count -= written;
> + written_buffered = generic_file_buffered_write(iocb, iov,
> + nr_segs, pos, ppos, count,
> + written);
> + /*
> + * If generic_file_buffered_write() retuned a synchronous error
> + * then we want to return the number of bytes which were
> + * direct-written, or the error code if that was zero. Note
> + * that this differs from normal direct-io semantics, which
> + * will return -EFOO even if some bytes were written.
> + */
> + if (written_buffered < 0) {
> + ret = written_buffered;
> + goto out;
> + }
> +
> + /*
> + * We need to ensure that the page cache pages are written to
> + * disk and invalidated to preserve the expected O_DIRECT
> + * semantics.
> + */
> + endbyte = pos + written_buffered - written - 1;
> + ret = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
> + if (ret == 0) {
> + written = written_buffered;
> + invalidate_mapping_pages(mapping,
> + pos >> PAGE_CACHE_SHIFT,
> + endbyte >> PAGE_CACHE_SHIFT);
> + } else {
> + /*
> + * We don't know how much we wrote, so just return
> + * the number of bytes which were direct-written
> + */
> + }
> +
> +out:
> + current->backing_dev_info = NULL;
> + ret = written ? written : ret;
> +
> +unlock_out:
> + if (overwrite)
> + up_read(&EXT4_I(inode)->i_data_sem);
> + else
> + mutex_unlock(&inode->i_mutex);
> +
> + if (ret > 0 || ret == -EIOCBQUEUED) {
> + ssize_t err;
> +
> + err = generic_write_sync(file, pos, ret);
> + if (err < 0 && ret > 0)
> + ret = err;
> + }
> + blk_finish_plug(&plug);
>
> if (unaligned_aio)
> mutex_unlock(ext4_aio_mutex(inode));