Currently, if an AIO/DIO/O_SYNC write is issued, generic_write_sync is
called after the submission of the I/O instead of after the completion.
This can result in flushing the disk cache before the actual write even
reaches the disk! This patch set attempts to fix that for xfs and ext4.
Comments would be greatly appreciated.
Cheers,
Jeff
[PATCH 1/3] xfs: honor the O_SYNC flag for asynchronous direct I/O requests
[PATCH 2/3] ext4: honor the O_SYNC flag for asynchronous direct I/O requests
[PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED
Hi,
As it stands, generic_file_aio_write will call into generic_write_sync
when -EIOCBQUEUED is returned from __generic_file_aio_write. EIOCBQUEUED
indicates that an I/O was submitted but NOT completed. Thus, we will
flush the disk cache, potentially before the write(s) even make it to
the disk! Up until now, this has been the best we could do, as file
systems didn't bother to flush the disk cache after an O_SYNC AIO+DIO
write. After applying the prior two patches to xfs and ext4, at least
the major two file systems do the right thing. So, let's go ahead and
fix this backwards logic.
Signed-off-by: Jeff Moyer <[email protected]>
---
mm/filemap.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index c4ee2e9..004442f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2634,7 +2634,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
--
1.7.1
Hi,
If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
flushed after the write completion. Instead, it's flushed *before* the
I/O is sent to the disk (in __generic_file_aio_write). This patch
attempts to fix that problem by marking an I/O as requiring a cache
flush in endio processing. I'll send a follow-on patch to the
generic write code to get rid of the bogus generic_write_sync call
when EIOCBQUEUED is returned.
Signed-off-by: Jeff Moyer <[email protected]>
---
fs/xfs/xfs_aops.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++-
fs/xfs/xfs_aops.h | 1 +
fs/xfs/xfs_buf.c | 9 +++++++
3 files changed, 77 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee..909e020 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -158,6 +158,48 @@ xfs_setfilesize(
}
/*
+ * In the case of synchronous, AIO, O_DIRECT writes, we need to flush
+ * the disk cache when the I/O is complete.
+ */
+STATIC bool
+xfs_ioend_needs_cache_flush(
+ struct xfs_ioend *ioend)
+{
+ if (!ioend->io_isasync)
+ return false;
+
+ return (IS_SYNC(ioend->io_inode) ||
+ (ioend->io_iocb->ki_filp->f_flags & O_DSYNC));
+}
+
+STATIC void
+xfs_end_io_flush(
+ struct bio *bio,
+ int error)
+{
+ struct xfs_ioend *ioend = bio->bi_private;
+
+ if (error && ioend->io_result > 0)
+ ioend->io_result = error;
+
+ xfs_destroy_ioend(ioend);
+ bio_put(bio);
+}
+
+STATIC void
+xfs_ioend_flush_cache(
+ struct xfs_ioend *ioend)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_KERNEL, 0);
+ bio->bi_end_io = xfs_end_io_flush;
+ bio->bi_bdev = xfs_find_bdev_for_inode(ioend->io_inode);
+ bio->bi_private = ioend;
+ submit_bio(WRITE_FLUSH, bio);
+}
+
+/*
* Schedule IO completion handling on the final put of an ioend.
*
* If there is no work to do we might as well call it a day and free the
@@ -172,6 +214,8 @@ xfs_finish_ioend(
queue_work(xfsconvertd_workqueue, &ioend->io_work);
else if (xfs_ioend_is_append(ioend))
queue_work(xfsdatad_workqueue, &ioend->io_work);
+ else if (xfs_ioend_needs_cache_flush(ioend))
+ queue_work(xfsflushd_workqueue, &ioend->io_work);
else
xfs_destroy_ioend(ioend);
}
@@ -226,9 +270,30 @@ done:
xfs_finish_ioend(ioend);
/* ensure we don't spin on blocked ioends */
delay(1);
- } else {
+ } else if (xfs_ioend_needs_cache_flush(ioend)) {
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ int err;
+ int log_flushed = 0;
+
+ /*
+ * Check to see if we only need to sync data. If so,
+ * we can skip the log flush.
+ */
+ if (IS_SYNC(ioend->io_inode) ||
+ (ioend->io_iocb->ki_filp->f_flags & __O_SYNC)) {
+ err = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
+ if (err && ioend->io_result > 0)
+ ioend->io_result = err;
+ if (err || log_flushed) {
+ xfs_destroy_ioend(ioend);
+ return;
+ }
+ }
+ /* log not flushed or data sync only, flush the disk cache */
+ xfs_ioend_flush_cache(ioend);
+ } else
xfs_destroy_ioend(ioend);
- }
}
/*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 116dd5c..3f4a1c4 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -20,6 +20,7 @@
extern struct workqueue_struct *xfsdatad_workqueue;
extern struct workqueue_struct *xfsconvertd_workqueue;
+extern struct workqueue_struct *xfsflushd_workqueue;
extern mempool_t *xfs_ioend_pool;
/*
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4dff85c..39980a8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -47,6 +47,7 @@ STATIC int xfsbufd(void *);
static struct workqueue_struct *xfslogd_workqueue;
struct workqueue_struct *xfsdatad_workqueue;
struct workqueue_struct *xfsconvertd_workqueue;
+struct workqueue_struct *xfsflushd_workqueue;
#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
@@ -1802,8 +1803,15 @@ xfs_buf_init(void)
if (!xfsconvertd_workqueue)
goto out_destroy_xfsdatad_workqueue;
+ xfsflushd_workqueue = alloc_workqueue("xfsflushd",
+ WQ_MEM_RECLAIM, 1);
+ if (!xfsflushd_workqueue)
+ goto out_destroy_xfsconvertd_workqueue;
+
return 0;
+ out_destroy_xfsconvertd_workqueue:
+ destroy_workqueue(xfsconvertd_workqueue);
out_destroy_xfsdatad_workqueue:
destroy_workqueue(xfsdatad_workqueue);
out_destroy_xfslogd_workqueue:
@@ -1817,6 +1825,7 @@ xfs_buf_init(void)
void
xfs_buf_terminate(void)
{
+ destroy_workqueue(xfsflushd_workqueue);
destroy_workqueue(xfsconvertd_workqueue);
destroy_workqueue(xfsdatad_workqueue);
destroy_workqueue(xfslogd_workqueue);
--
1.7.1
Hi,
If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
flushed after the write completion. Instead, it's flushed *before* the
I/O is sent to the disk (in __generic_file_aio_write). This patch
attempts to fix that problem by marking an I/O as requiring a cache
flush in endio processing. I'll send a follow-on patch to the
generic write code to get rid of the bogus generic_write_sync call
when EIOCBQUEUED is returned.
Signed-off-by: Jeff Moyer <[email protected]>
---
fs/ext4/ext4.h | 4 ++++
fs/ext4/inode.c | 11 +++++++++--
fs/ext4/page-io.c | 39 ++++++++++++++++++++++++++++++++-------
fs/ext4/super.c | 11 +++++++++++
4 files changed, 56 insertions(+), 9 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2d55d7c..4377ed3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -185,6 +185,7 @@ struct mpage_da_data {
#define EXT4_IO_END_ERROR 0x0002
#define EXT4_IO_END_QUEUED 0x0004
#define EXT4_IO_END_DIRECT 0x0008
+#define EXT4_IO_END_NEEDS_SYNC 0x0010
struct ext4_io_page {
struct page *p_page;
@@ -1247,6 +1248,9 @@ struct ext4_sb_info {
/* workqueue for dio unwritten */
struct workqueue_struct *dio_unwritten_wq;
+ /* workqueue for aio+dio+o_sync disk cache flushing */
+ struct workqueue_struct *aio_dio_flush_wq;
+
/* timer for periodic error stats printing */
struct timer_list s_err_report;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f6dc02b..13cdb4c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2769,8 +2769,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
iocb->private = NULL;
+ /* AIO+DIO+O_SYNC I/Os need a cache flush after completion */
+ if (is_async && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_DSYNC)))
+ io_end->flag |= EXT4_IO_END_NEEDS_SYNC;
+
/* if not aio dio with unwritten extents, just free io and return */
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ if (!(io_end->flag & (EXT4_IO_END_UNWRITTEN|EXT4_IO_END_NEEDS_SYNC))) {
ext4_free_io_end(io_end);
out:
if (is_async)
@@ -2785,7 +2789,10 @@ out:
io_end->iocb = iocb;
io_end->result = ret;
}
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN)
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+ else
+ wq = EXT4_SB(io_end->inode->i_sb)->aio_dio_flush_wq;
/* Add the io_end to per-inode completed aio dio list*/
ei = EXT4_I(io_end->inode);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 9e1b8eb..d07cd40 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -98,15 +98,40 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
- ret = ext4_convert_unwritten_extents(inode, offset, size);
- if (ret < 0) {
- ext4_msg(inode->i_sb, KERN_EMERG,
- "failed to convert unwritten extents to written "
- "extents -- potential data loss! "
- "(inode %lu, offset %llu, size %zd, error %d)",
- inode->i_ino, offset, size, ret);
+ if (io->flag & EXT4_IO_END_UNWRITTEN) {
+
+ ret = ext4_convert_unwritten_extents(inode, offset, size);
+ if (ret < 0) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "failed to convert unwritten extents to "
+ "written extents -- potential data loss! "
+ "(inode %lu, offset %llu, size %zd, error %d)",
+ inode->i_ino, offset, size, ret);
+ goto endio;
+ }
+ }
+
+ /*
+ * This function has two callers. The first is the end_io_work
+ * routine just below. This is an asynchronous completion context.
+ * The second is in the fsync path. For the latter path, we can't
+ * return from here until the job is done. Hence, we issue a
+ * blocking blkdev_issue_flush call.
+ */
+ if (io->flag & EXT4_IO_END_NEEDS_SYNC) {
+ /*
+ * Ideally, we'd like to know if the force_commit routine
+ * actually did send something to disk. If it didn't,
+ * then we need to issue the cache flush by hand. For now,
+ * play it safe and do both.
+ */
+ ret = ext4_force_commit(inode->i_sb);
+ if (ret)
+ goto endio;
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
}
+endio:
if (io->iocb)
aio_complete(io->iocb, io->result, 0);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 502c61f..a24938e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -805,6 +805,7 @@ static void ext4_put_super(struct super_block *sb)
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
flush_workqueue(sbi->dio_unwritten_wq);
+ destroy_workqueue(sbi->aio_dio_flush_wq);
destroy_workqueue(sbi->dio_unwritten_wq);
lock_super(sb);
@@ -3718,6 +3719,13 @@ no_journal:
goto failed_mount_wq;
}
+ EXT4_SB(sb)->aio_dio_flush_wq =
+ alloc_workqueue("ext4-aio-dio-flush", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!EXT4_SB(sb)->aio_dio_flush_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create flush workqueue\n");
+ goto failed_flush_wq;
+ }
+
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
@@ -3840,6 +3848,8 @@ failed_mount4a:
sb->s_root = NULL;
failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
+ destroy_workqueue(EXT4_SB(sb)->aio_dio_flush_wq);
+failed_flush_wq:
destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
failed_mount_wq:
if (sbi->s_journal) {
@@ -4303,6 +4313,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->dio_unwritten_wq);
+ flush_workqueue(sbi->aio_dio_flush_wq);
if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
jbd2_log_wait_commit(sbi->s_journal, target);
--
1.7.1
This looks pretty good. Did this pass xfstests? I'd also like to add
tests that actually exercise this code path, just to be sure, e.g.
variants of aio-stress that actually use O_SYNC. We can't easily verify
that the data really made it to disk that way, but at least we'd make
sure the code doesn't break.
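Something along these lines is what I have in mind; a rough, untested
userspace sketch using libaio (the file name and buffer sizes here are
arbitrary):

#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	void *buf;
	int fd;

	/* the O_DIRECT|O_SYNC AIO write path the patches change */
	fd = open("testfile", O_CREAT | O_WRONLY | O_DIRECT | O_SYNC, 0644);
	if (fd < 0)
		return 1;
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0xab, 4096);

	if (io_setup(1, &ctx))
		return 1;
	io_prep_pwrite(&cb, fd, buf, 4096, 0);
	if (io_submit(ctx, 1, cbs) != 1)
		return 1;
	/* with the series applied, completion implies the cache flush ran */
	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
		return 1;

	io_destroy(ctx);
	close(fd);
	free(buf);
	return ev.res == 4096 ? 0 : 1;
}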
On Fri, Jan 27, 2012 at 04:15:47PM -0500, Jeff Moyer wrote:
> Hi,
>
> If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
> flushed after the write completion. Instead, it's flushed *before* the
> I/O is sent to the disk (in __generic_file_aio_write).
XFS doesn't actually use __generic_file_aio_write, so this sentence
isn't correct for XFS.
> + } else if (xfs_ioend_needs_cache_flush(ioend)) {
> + struct xfs_inode *ip = XFS_I(ioend->io_inode);
> + struct xfs_mount *mp = ip->i_mount;
> + int err;
> + int log_flushed = 0;
> +
> + /*
> + * Check to see if we only need to sync data. If so,
> + * we can skip the log flush.
> + */
> + if (IS_SYNC(ioend->io_inode) ||
> + (ioend->io_iocb->ki_filp->f_flags & __O_SYNC)) {
> + err = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
Can you add a TODO comment that this actually is synchronous and thus
will block the I/O completion work queue?
Also, you can use _xfs_log_force_lsn here, as we don't need to flush the
whole log, just up to the last lsn that touched the inode. Copy, or
better, factor out the code from xfs_dir_fsync for that.
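Roughly what I mean, modelled on xfs_dir_fsync and reusing the locals from
your hunk above (untested sketch, field names from memory):

	xfs_lsn_t	lsn = 0;

	/* only force the log up to the last lsn that touched this inode */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip))
		lsn = ip->i_itemp->ili_last_lsn;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (lsn) {
		err = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
		if (err && ioend->io_result > 0)
			ioend->io_result = err;
	}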
Last but not least, this won't catch timestamp updates. Given that I'm
about to send a series making timestamp updates transactional, I would
not recommend bothering with that, but if you want to, take a look at
how xfs_file_fsync deals with them. Given that this series touches the
same area, I'd also like to take your xfs patch in through the xfs tree
to avoid conflicts.
> @@ -47,6 +47,7 @@ STATIC int xfsbufd(void *);
> static struct workqueue_struct *xfslogd_workqueue;
> struct workqueue_struct *xfsdatad_workqueue;
> struct workqueue_struct *xfsconvertd_workqueue;
> +struct workqueue_struct *xfsflushd_workqueue;
>
> #ifdef XFS_BUF_LOCK_TRACKING
> # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
> @@ -1802,8 +1803,15 @@ xfs_buf_init(void)
> if (!xfsconvertd_workqueue)
> goto out_destroy_xfsdatad_workqueue;
>
> + xfsflushd_workqueue = alloc_workqueue("xfsflushd",
> + WQ_MEM_RECLAIM, 1);
This should allow a higher concurrency level; it's probably a good
idea to pass 0 and use the default.
Adding linux-btrfs to Cc.
On Friday, 27 January 2012, Jeff Moyer wrote:
> Hi,
Hi,
> As it stands, generic_file_aio_write will call into generic_write_sync
> when -EIOCBQUEUED is returned from __generic_file_aio_write.
> EIOCBQUEUED indicates that an I/O was submitted but NOT completed.
> Thus, we will flush the disk cache, potentially before the write(s)
> even make it to the disk! Up until now, this has been the best we
> could do, as file systems didn't bother to flush the disk cache after
> an O_SYNC AIO+DIO write. After applying the prior two patches to xfs
> and ext4, at least the major two file systems do the right thing. So,
> let's go ahead and fix this backwards logic.
Would this need an adaptation for BTRFS as well?
Thanks,
Martin
> Signed-off-by: Jeff Moyer <[email protected]>
> ---
> mm/filemap.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index c4ee2e9..004442f 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2634,7 +2634,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
> 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
> 	mutex_unlock(&inode->i_mutex);
>
> - if (ret > 0 || ret == -EIOCBQUEUED) {
> + if (ret > 0) {
> ssize_t err;
>
> err = generic_write_sync(file, pos, ret);
--
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA B82F 991B EAAC A599 84C7
Hi,
On Fri 27-01-12 16:15:48, Jeff Moyer wrote:
> If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
> flushed after the write completion. Instead, it's flushed *before* the
> I/O is sent to the disk (in __generic_file_aio_write). This patch
> attempts to fix that problem by marking an I/O as requiring a cache
> flush in endio processing. I'll send a follow-on patch to the
> generic write code to get rid of the bogus generic_write_sync call
> when EIOCBQUEUED is returned.
Thanks for the patch!
> Signed-off-by: Jeff Moyer <[email protected]>
> ---
> fs/ext4/ext4.h | 4 ++++
> fs/ext4/inode.c | 11 +++++++++--
> fs/ext4/page-io.c | 39 ++++++++++++++++++++++++++++++++-------
> fs/ext4/super.c | 11 +++++++++++
> 4 files changed, 56 insertions(+), 9 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 2d55d7c..4377ed3 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -185,6 +185,7 @@ struct mpage_da_data {
> #define EXT4_IO_END_ERROR 0x0002
> #define EXT4_IO_END_QUEUED 0x0004
> #define EXT4_IO_END_DIRECT 0x0008
> +#define EXT4_IO_END_NEEDS_SYNC 0x0010
>
> struct ext4_io_page {
> struct page *p_page;
> @@ -1247,6 +1248,9 @@ struct ext4_sb_info {
> /* workqueue for dio unwritten */
> struct workqueue_struct *dio_unwritten_wq;
>
> + /* workqueue for aio+dio+o_sync disk cache flushing */
> + struct workqueue_struct *aio_dio_flush_wq;
> +
Hmm, looking at the patch I'm wondering why you introduced the new
workqueue? It seems dio_unwritten_wq would be enough? You just need to
rename it to something more appropriate ;)
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 9e1b8eb..d07cd40 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -98,15 +98,40 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
> "list->prev 0x%p\n",
> io, inode->i_ino, io->list.next, io->list.prev);
>
> - ret = ext4_convert_unwritten_extents(inode, offset, size);
> - if (ret < 0) {
> - ext4_msg(inode->i_sb, KERN_EMERG,
> - "failed to convert unwritten extents to written "
> - "extents -- potential data loss! "
> - "(inode %lu, offset %llu, size %zd, error %d)",
> - inode->i_ino, offset, size, ret);
> + if (io->flag & EXT4_IO_END_UNWRITTEN) {
> +
> + ret = ext4_convert_unwritten_extents(inode, offset, size);
> + if (ret < 0) {
> + ext4_msg(inode->i_sb, KERN_EMERG,
> + "failed to convert unwritten extents to "
> + "written extents -- potential data loss! "
> + "(inode %lu, offset %llu, size %zd, error %d)",
> + inode->i_ino, offset, size, ret);
> + goto endio;
> + }
> + }
> +
> + /*
> + * This function has two callers. The first is the end_io_work
> + * routine just below. This is an asynchronous completion context.
> + * The second is in the fsync path. For the latter path, we can't
> + * return from here until the job is done. Hence, we issue a
> + * blocking blkdev_issue_flush call.
> + */
> + if (io->flag & EXT4_IO_END_NEEDS_SYNC) {
> + /*
> + * Ideally, we'd like to know if the force_commit routine
> + * actually did send something to disk. If it didn't,
> + * then we need to issue the cache flush by hand. For now,
> + * play it safe and do both.
> + */
> + ret = ext4_force_commit(inode->i_sb);
> + if (ret)
> + goto endio;
> + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
Look at what ext4_sync_file() does. It's more efficient than this.
You need something like:
commit_tid = file->f_flags & __O_SYNC ? EXT4_I(inode)->i_sync_tid :
EXT4_I(inode)->i_datasync_tid;
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
needs_barrier = true;
jbd2_log_start_commit(journal, commit_tid);
jbd2_log_wait_commit(journal, commit_tid);
if (needs_barrier)
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
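Wired into ext4_end_io_nolock() that might look roughly like the sketch
below; untested, and assuming io->iocb is always set when
EXT4_IO_END_NEEDS_SYNC is (which ext4_end_io_dio() guarantees). A
nojournal filesystem would still need the plain blkdev_issue_flush()
fallback:

	if (io->flag & EXT4_IO_END_NEEDS_SYNC) {
		journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
		tid_t commit_tid;
		bool needs_barrier = false;

		/* pick the right transaction: full sync vs. datasync */
		commit_tid = (io->iocb->ki_filp->f_flags & __O_SYNC) ?
				EXT4_I(inode)->i_sync_tid :
				EXT4_I(inode)->i_datasync_tid;
		if (journal->j_flags & JBD2_BARRIER &&
		    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
			needs_barrier = true;
		jbd2_log_start_commit(journal, commit_tid);
		jbd2_log_wait_commit(journal, commit_tid);
		if (needs_barrier)
			ret = blkdev_issue_flush(inode->i_sb->s_bdev,
						 GFP_NOIO, NULL);
	}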
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
Hello,
On Fri 27-01-12 16:15:49, Jeff Moyer wrote:
> As it stands, generic_file_aio_write will call into generic_write_sync
> when -EIOCBQUEUED is returned from __generic_file_aio_write. EIOCBQUEUED
> indicates that an I/O was submitted but NOT completed. Thus, we will
> flush the disk cache, potentially before the write(s) even make it to
> the disk!
Yeah. It seems to be a problem introduced by Tejun's rewrite of barrier
code, right? Before that we'd drain the IO queue when cache flush is issued
and thus effectively wait for IO completion...
> Up until now, this has been the best we could do, as file
> systems didn't bother to flush the disk cache after an O_SYNC AIO+DIO
> write. After applying the prior two patches to xfs and ext4, at least
> the major two file systems do the right thing. So, let's go ahead and
> fix this backwards logic.
But doesn't this break the filesystems you didn't explicitly fix even
more than they were broken before? You are right that they might have sent
the cache flush too early, but they'd at least properly force all metadata
modifications (e.g. from allocation) to disk. But after this patch, O_SYNC
will simply have no effect for those filesystems.
Also, I was wondering whether we couldn't implement the fix in the VFS.
Basically it would be the same as the fix for ext4: have a per-sb workqueue
and queue work calling generic_write_sync() from the end_io handler when the
file is O_SYNC. That would solve the issue for all filesystems...
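Hand-waving over the details, the work item could look something like
this (names invented here, completely untested):

#include <linux/fs.h>
#include <linux/workqueue.h>
#include <linux/aio.h>
#include <linux/slab.h>

/* invented names: a per-sb work item that runs generic_write_sync()
 * only after the AIO+DIO write has actually completed */
struct write_sync_work {
	struct work_struct	work;
	struct kiocb		*iocb;
	loff_t			pos;
	ssize_t			count;
};

static void do_write_sync(struct work_struct *work)
{
	struct write_sync_work *wsw =
		container_of(work, struct write_sync_work, work);
	int err;

	err = generic_write_sync(wsw->iocb->ki_filp, wsw->pos, wsw->count);
	if (err < 0 && wsw->count > 0)
		wsw->count = err;
	/* complete the aio only once the sync is done */
	aio_complete(wsw->iocb, wsw->count, 0);
	kfree(wsw);
}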
Honza
> Signed-off-by: Jeff Moyer <[email protected]>
> ---
> mm/filemap.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index c4ee2e9..004442f 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2634,7 +2634,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
> ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
> mutex_unlock(&inode->i_mutex);
>
> - if (ret > 0 || ret == -EIOCBQUEUED) {
> + if (ret > 0) {
> ssize_t err;
>
> err = generic_write_sync(file, pos, ret);
> --
> 1.7.1
--
Jan Kara <[email protected]>
SUSE Labs, CR
Jan Kara <[email protected]> writes:
>> + /* workqueue for aio+dio+o_sync disk cache flushing */
>> + struct workqueue_struct *aio_dio_flush_wq;
>> +
> Hmm, looking at the patch I'm wondering why did you introduce the new
> workqueue? It seems dio_unwritten_wq would be enough? You just need to
> rename it to something more appropriate ;)
I used a new workqueue as the operations are blocking, and I didn't want
to hold up other progress. If you think re-using the unwritten_wq is
the right thing to do, I'm happy to comply.
>> + /*
>> + * This function has two callers. The first is the end_io_work
>> + * routine just below. This is an asynchronous completion context.
>> + * The second is in the fsync path. For the latter path, we can't
>> + * return from here until the job is done. Hence, we issue a
>> + * blocking blkdev_issue_flush call.
>> + */
>> + if (io->flag & EXT4_IO_END_NEEDS_SYNC) {
>> + /*
>> + * Ideally, we'd like to know if the force_commit routine
>> + * actually did send something to disk. If it didn't,
>> + * then we need to issue the cache flush by hand. For now,
>> + * play it safe and do both.
>> + */
>> + ret = ext4_force_commit(inode->i_sb);
>> + if (ret)
>> + goto endio;
>> + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
> Look at what ext4_sync_file() does. It's more efficient than this.
> You need something like:
> commit_tid = file->f_flags & __O_SYNC ? EXT4_I(inode)->i_sync_tid :
> EXT4_I(inode)->i_datasync_tid;
> if (journal->j_flags & JBD2_BARRIER &&
> !jbd2_trans_will_send_data_barrier(journal, commit_tid))
> needs_barrier = true;
> jbd2_log_start_commit(journal, commit_tid);
> jbd2_log_wait_commit(journal, commit_tid);
> if (needs_barrier)
> blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
Great, thanks for the pointer!
Cheers,
Jeff
Jan Kara <[email protected]> writes:
> Hello,
>
> On Fri 27-01-12 16:15:49, Jeff Moyer wrote:
>> As it stands, generic_file_aio_write will call into generic_write_sync
>> when -EIOCBQUEUED is returned from __generic_file_aio_write. EIOCBQUEUED
>> indicates that an I/O was submitted but NOT completed. Thus, we will
>> flush the disk cache, potentially before the write(s) even make it to
>> the disk!
> Yeah. It seems to be a problem introduced by Tejun's rewrite of barrier
> code, right? Before that we'd drain the IO queue when cache flush is issued
> and thus effectively wait for IO completion...
Right, though hch seems to think even then the problem existed.
>> Up until now, this has been the best we could do, as file
>> systems didn't bother to flush the disk cache after an O_SYNC AIO+DIO
>> write. After applying the prior two patches to xfs and ext4, at least
>> the major two file systems do the right thing. So, let's go ahead and
>> fix this backwards logic.
> But doesn't this break filesystems which you didn't fix explicitly even
> more than they were? You are right they might have sent cache flush too
> early but they'd at least properly force all metadata modifications (e.g.
> from allocation) to disk. But after this patch O_SYNC will have simply no
> effect for these filesystems.
Yep. Note that we're calling into generic_write_sync with a negative
value. I followed that call chain all the way down and convinced myself
that it was "mostly harmless," but it sure as heck ain't right. I'll
audit other file systems to see whether it's a problem. btrfs, at
least, isn't affected by this.
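For reference, generic_write_sync() boils down to roughly the following
(paraphrased from memory, so treat it as a sketch); with a negative count,
vfs_fsync_range() ends up being asked for a range whose end precedes its
start:

int generic_write_sync(struct file *file, loff_t pos, loff_t count)
{
	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
		return 0;
	return vfs_fsync_range(file, pos, pos + count - 1,
			       (file->f_flags & __O_SYNC) ? 0 : 1);
}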
> Also I was thinking whether we couldn't implement the fix in VFS. Basically
> it would be the same like the fix for ext4. Like having a per-sb workqueue
> and queue work calling generic_write_sync() from end_io handler when the
> file is O_SYNC? That would solve the issue for all filesystems...
Well, that would require buy-in from the other file system developers.
What do the XFS folks think?
Cheers,
Jeff
On Mon 06-02-12 11:20:29, Jeff Moyer wrote:
> Jan Kara <[email protected]> writes:
>
> >> + /* workqueue for aio+dio+o_sync disk cache flushing */
> >> + struct workqueue_struct *aio_dio_flush_wq;
> >> +
> > Hmm, looking at the patch I'm wondering why did you introduce the new
> > workqueue? It seems dio_unwritten_wq would be enough? You just need to
> > rename it to something more appropriate ;)
>
> I used a new workqueue as the operations are blocking, and I didn't want
> to hold up other progress. If you think re-using the unwritten_wq is
> the right thing to do, I'm happy to comply.
Ah, ok. Thinking about it, it's probably better to use a separate work
queue then.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon, Feb 06, 2012 at 11:33:29AM -0500, Jeff Moyer wrote:
> > code, right? Before that we'd drain the IO queue when cache flush is issued
> > and thus effectively wait for IO completion...
>
> Right, though hch seems to think even then the problem existed.
I was wrong, using -o barrier it didn't. That was however not something
people using O_SYNC heavy production loads would do, they'd use disabled
caches and nobarrier.
> > Also I was thinking whether we couldn't implement the fix in VFS. Basically
> > it would be the same like the fix for ext4. Like having a per-sb workqueue
> > and queue work calling generic_write_sync() from end_io handler when the
> > file is O_SYNC? That would solve the issue for all filesystems...
>
> Well, that would require buy-in from the other file system developers.
> What do the XFS folks think?
I don't think using that code for XFS makes sense. But just like
generic_write_sync, there's no reason it can't be added to generic code;
just make sure only generic_file_aio_write/__generic_file_aio_write use
it, and that generic_file_buffered_write and generic_file_direct_write
stay clear of it.
Christoph Hellwig <[email protected]> writes:
> On Mon, Feb 06, 2012 at 11:33:29AM -0500, Jeff Moyer wrote:
>> > code, right? Before that we'd drain the IO queue when cache flush is issued
>> > and thus effectively wait for IO completion...
>>
>> Right, though hch seems to think even then the problem existed.
>
> I was wrong, using -o barrier it didn't. That was however not something
> people using O_SYNC heavy production loads would do, they'd use disabled
> caches and nobarrier.
>
>> > Also I was thinking whether we couldn't implement the fix in VFS. Basically
>> > it would be the same like the fix for ext4. Like having a per-sb workqueue
>> > and queue work calling generic_write_sync() from end_io handler when the
>> > file is O_SYNC? That would solve the issue for all filesystems...
>>
>> Well, that would require buy-in from the other file system developers.
>> What do the XFS folks think?
>
> I don't think using that code for XFS makes sense. But just like
> generic_write_sync there's no reason it can't be added to generic code,
> just make sure only generic_file_aio_write/__generic_file_aio_write use
> it, but generic_file_buffered_write and generic_file_direct_write stay
> clear of it.
ext4_file_write (ext4's .aio_write routine) calls into
generic_file_aio_write. So, I don't think we can assume that a file
system calling this routine doesn't install its own endio handler.
What's more, we'd have to pass an endio routine down the call stack
quite a ways. In all, I think that would be an uglier solution to
the problem. Did I miss something?
Cheers,
Jeff
Jan Kara <[email protected]> writes:
> Look at what ext4_sync_file() does. It's more efficient than this.
> You need something like:
> commit_tid = file->f_flags & __O_SYNC ? EXT4_I(inode)->i_sync_tid :
> EXT4_I(inode)->i_datasync_tid;
> if (journal->j_flags & JBD2_BARRIER &&
> !jbd2_trans_will_send_data_barrier(journal, commit_tid))
> needs_barrier = true;
> jbd2_log_start_commit(journal, commit_tid);
> jbd2_log_wait_commit(journal, commit_tid);
> if (needs_barrier)
> blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
If the transaction won't send a data barrier, wouldn't you want to issue
the flush on the data device prior to committing the transaction, not
after it?
-Jeff
On Tue 07-02-12 15:39:06, Jeff Moyer wrote:
> Christoph Hellwig <[email protected]> writes:
> > On Mon, Feb 06, 2012 at 11:33:29AM -0500, Jeff Moyer wrote:
> >> > code, right? Before that we'd drain the IO queue when cache flush is issued
> >> > and thus effectively wait for IO completion...
> >>
> >> Right, though hch seems to think even then the problem existed.
> >
> > I was wrong, using -o barrier it didn't. That was however not something
> > people using O_SYNC heavy production loads would do, they'd use disabled
> > caches and nobarrier.
> >
> >> > Also I was thinking whether we couldn't implement the fix in VFS. Basically
> >> > it would be the same like the fix for ext4. Like having a per-sb workqueue
> >> > and queue work calling generic_write_sync() from end_io handler when the
> >> > file is O_SYNC? That would solve the issue for all filesystems...
> >>
> >> Well, that would require buy-in from the other file system developers.
> >> What do the XFS folks think?
> >
> > I don't think using that code for XFS makes sense. But just like
> > generic_write_sync there's no reason it can't be added to generic code,
> > just make sure only generic_file_aio_write/__generic_file_aio_write use
> > it, but generic_file_buffered_write and generic_file_direct_write stay
> > clear of it.
>
> ext4_file_write (ext4's .aio_write routine) calls into
> generic_file_aio_write. So, I don't think we can generalize that this
> routine means that the file system doesn't install its own endio
> handler. What's more, we'd have to pass an endio routine down the call
> stack quite a ways. In all, I think that would be an uglier solution to
> the problem. Did I miss something?
I think it can be done in a relatively elegant way. A POC patch (completely
untested) is attached. What do you think? All filesystems using
blockdev_direct_IO() can be easily converted to use this, and gfs2 & ocfs2
can also use the framework. That leaves only ext4, xfs & btrfs, which need
special handling. Actually, maybe btrfs could be converted as well, because
it doesn't seem to need to offload anything else to the workqueue. But I'm
not really sure...
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
Jan Kara <[email protected]> writes:
> On Tue 07-02-12 15:39:06, Jeff Moyer wrote:
>> Christoph Hellwig <[email protected]> writes:
>> > On Mon, Feb 06, 2012 at 11:33:29AM -0500, Jeff Moyer wrote:
>> >> > code, right? Before that we'd drain the IO queue when cache flush is issued
>> >> > and thus effectively wait for IO completion...
>> >>
>> >> Right, though hch seems to think even then the problem existed.
>> >
>> > I was wrong, using -o barrier it didn't. That was however not something
>> > people using O_SYNC heavy production loads would do, they'd use disabled
>> > caches and nobarrier.
>> >
>> >> > Also I was thinking whether we couldn't implement the fix in VFS. Basically
>> >> > it would be the same like the fix for ext4. Like having a per-sb workqueue
>> >> > and queue work calling generic_write_sync() from end_io handler when the
>> >> > file is O_SYNC? That would solve the issue for all filesystems...
>> >>
>> >> Well, that would require buy-in from the other file system developers.
>> >> What do the XFS folks think?
>> >
>> > I don't think using that code for XFS makes sense. But just like
>> > generic_write_sync there's no reason it can't be added to generic code,
>> > just make sure only generic_file_aio_write/__generic_file_aio_write use
>> > it, but generic_file_buffered_write and generic_file_direct_write stay
>> > clear of it.
>>
>> ext4_file_write (ext4's .aio_write routine) calls into
>> generic_file_aio_write. So, I don't think we can generalize that this
>> routine means that the file system doesn't install its own endio
>> handler. What's more, we'd have to pass an endio routine down the call
>> stack quite a ways. In all, I think that would be an uglier solution to
>> the problem. Did I miss something?
> I think it can be done in a relatively elegant way. POC patch (completely
> untested) is attached. What do you think? All filesystems using
> blockdev_direct_IO() can be easily converted to use this, gfs2 & ocfs2 can
> also use the framework. That leaves only ext4, xfs & btrfs which need
> special handling. Actually, maybe btrfs could be converted as well because
> it doesn't seem to need to offload anything else to workqueue. But I'm not
> really sure...
I like it!
-Jeff
On Wed 08-02-12 10:11:47, Jeff Moyer wrote:
> Jan Kara <[email protected]> writes:
>
> > Look at what ext4_sync_file() does. It's more efficient than this.
> > You need something like:
> > commit_tid = file->f_flags & __O_SYNC ? EXT4_I(inode)->i_sync_tid :
> > EXT4_I(inode)->i_datasync_tid;
> > if (journal->j_flags & JBD2_BARRIER &&
> > !jbd2_trans_will_send_data_barrier(journal, commit_tid))
> > needs_barrier = true;
> > jbd2_log_start_commit(journal, commit_tid);
> > jbd2_log_wait_commit(journal, commit_tid);
> > if (needs_barrier)
> > blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
>
> If the transaction won't send a data barrier, wouldn't you want to issue
> the flush on the data device prior to committing the transaction, not
> after it?
Sorry for the late reply. I was thinking about this because the answer isn't
simple... One certain fact is that once ext4_convert_unwritten_extents()
finishes (calls ext4_journal_stop()), the transaction with metadata updates
can commit, so whether we place the flush before or after
jbd2_log_start_commit() makes no difference. For filesystems where the
journal is on the filesystem device, the code should work correctly as is:
the journalling code will issue the barrier before the transaction commit is
done, and if there is no transaction to commit, the place where we issue the
cache flush does not matter. But for filesystems where the journal is on a
separate device, we indeed need to issue the flush before the transaction
commit is finished so that we don't expose uninitialized data after a crash.
Anyway, that's a separate (although related) issue from the one you fix in
this patch, so you can leave the patch as is and I'll fix up the above
problem in a separate patch. Thanks for noticing this!
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR