There are a number of races caused by lack of synchronization
between DIO/AIO workers and the truncate/fsync/punch_hole routines.
This patch series tries to optimize and fix the existing DIO/AIO code;
I observed significant performance improvements on big SMP hosts.
The test cases which helped me catch this type of bug were posted here:
http://www.spinics.net/lists/linux-fsdevel/msg58312.html
or are available in my github repo:
https://github.com/dmonakhov/xfstests/tree/ce8e3adab629b2a9be8ba2e73db7dad49eb46614
please run 286, 287 and 288.
TOC:
# first two are cleanups
ext4: ext4_inode_info diet
ext4: give i_aiodio_unwritten more appropriate name
# Bugfixes and improvements
ext4: fix unwritten counter leakage
ext4: completed_io locking cleanup V4
ext4: remove ext4_end_io()
ext4: serialize dio nonlocked reads with defrag workers V3
ext4: serialize unlocked dio reads with truncate
ext4: endless truncate due to nonlocked dio readers V2
ext4: serialize truncate with overwrite DIO workers V2
# Nasty punch_hole regressions
ext4: punch_hole should wait for DIO writers V2
ext4: fix ext_remove_space for punch_hole case
Changes from V3:
- satisfy ./checkpatch.pl rules
- Rework 'completed_io locking cleanup'
- fix typo in 'serialize dio nonlocked reads with defrag workers V3'
- add extra inode flag checks to 'punch_hole should wait for DIO'
Changes from V2:
Fix use-after-free for queued end_io_work.
fs/ext4/ext4.h | 36 +++++++-
fs/ext4/extents.c | 92 ++++++++++++++--------
fs/ext4/file.c | 6 -
fs/ext4/fsync.c | 81 --------------------
fs/ext4/indirect.c | 18 +++-
fs/ext4/inode.c | 49 +++++-------
fs/ext4/move_extent.c | 8 +
fs/ext4/page-io.c | 202 ++++++++++++++++++++++++++++++--------------------
fs/ext4/super.c | 3
9 files changed, 262 insertions(+), 235 deletions(-)
Signed-off-by: Dmitry Monakhov <[email protected]>
The AIO/DIO prefix is misleading because this counter accounts unwritten
extents, which may also be scheduled from buffered write endio.
Reviewed-by: Jan Kara <[email protected]>
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/ext4.h | 4 ++--
fs/ext4/file.c | 6 +++---
fs/ext4/page-io.c | 2 +-
fs/ext4/super.c | 2 +-
4 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 42be3ae..0d393dc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -917,7 +917,7 @@ struct ext4_inode_info {
struct list_head i_completed_io_list;
spinlock_t i_completed_io_lock;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
- atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
+ atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -1337,7 +1337,7 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+ atomic_inc(&EXT4_I(inode)->i_unwritten);
}
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index cfdf774..40c02c6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static void ext4_aiodio_wait(struct inode *inode)
+static void ext4_unwritten_wait(struct inode *inode)
{
wait_queue_head_t *wq = ext4_ioend_wq(inode);
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+ wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}
/*
@@ -116,7 +116,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
"performance will be poor.",
inode->i_ino, current->comm);
mutex_lock(ext4_aio_mutex(inode));
- ext4_aiodio_wait(inode);
+ ext4_unwritten_wait(inode);
}
BUG_ON(iocb->ki_pos != pos);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dcdeef1..de77e31 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -113,7 +113,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
if (io->flag & EXT4_IO_END_DIRECT)
inode_dio_done(inode);
/* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(io->inode));
return ret;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b5dfff8..f2d928c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -970,7 +970,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
- atomic_set(&ei->i_aiodio_unwritten, 0);
+ atomic_set(&ei->i_unwritten, 0);
return &ei->vfs_inode;
}
--
1.7.7.6
The generic inode has an unused i_private pointer which may be used as
cur_aio_dio storage.
TODO: if cur_aio_dio is passed as an argument to get_block_t, this will
allow concurrent AIO/DIO requests.
Reviewed-by: Zheng Liu <[email protected]>
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/ext4.h | 12 ++++++++++--
fs/ext4/extents.c | 4 ++--
fs/ext4/inode.c | 6 +++---
fs/ext4/super.c | 1 -
4 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6f37c11..42be3ae 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -917,8 +917,6 @@ struct ext4_inode_info {
struct list_head i_completed_io_list;
spinlock_t i_completed_io_lock;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
- /* current io_end structure for async DIO write*/
- ext4_io_end_t *cur_aio_dio;
atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -1343,6 +1341,16 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
}
}
+static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
+{
+ return inode->i_private;
+}
+
+static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
+{
+ inode->i_private = io;
+}
+
/*
* Inode dynamic state flags
*/
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c29379d..e9549f9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3530,7 +3530,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
{
int ret = 0;
int err = 0;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
"block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -3788,7 +3788,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
ext4_lblk_t cluster_offset;
ext_debug("blocks %u/%u requested for inode %lu\n",
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ad568b8..3a28cf7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3028,7 +3028,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* hook to the iocb.
*/
iocb->private = NULL;
- EXT4_I(inode)->cur_aio_dio = NULL;
+ ext4_inode_aio_set(inode, NULL);
if (!is_sync_kiocb(iocb)) {
ext4_io_end_t *io_end =
ext4_init_io_end(inode, GFP_NOFS);
@@ -3045,7 +3045,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* is a unwritten extents needs to be converted
* when IO is completed.
*/
- EXT4_I(inode)->cur_aio_dio = iocb->private;
+ ext4_inode_aio_set(inode, io_end);
}
if (overwrite)
@@ -3065,7 +3065,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
NULL,
DIO_LOCKING);
if (iocb->private)
- EXT4_I(inode)->cur_aio_dio = NULL;
+ ext4_inode_aio_set(inode, NULL);
/*
* The io_end structure takes a reference to the inode,
* that structure needs to be destroyed and the
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d5f4c97..b5dfff8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -967,7 +967,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_completed_io_list);
spin_lock_init(&ei->i_completed_io_lock);
- ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
--
1.7.7.6
The current unwritten extent conversion state machine is very fuzzy:
- For unknown reasons it wants to perform conversion under i_mutex. What for?
My diagnosis:
We already protect the extent tree with i_data_sem, and truncate and
punch_hole should wait for DIO, so the only data we have to protect is the
end_io->flags modification; but only flush_completed_IO and end_io_work
modify these flags, and we can serialize them via i_completed_io_lock.
Currently all these games with mutex_trylock() result in the following
deadlock:

truncate:                          kworker:
 ext4_setattr()                     ext4_end_io_work()
  mutex_lock(i_mutex)
   inode_dio_wait(inode)  ->BLOCK
                             DEADLOCK<-  mutex_trylock()
                                         inode_dio_done()
#TEST_CASE1_BEGIN
MNT=/mnt_scrach
unlink $MNT/file
fallocate -l $((1024*1024*1024)) $MNT/file
aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
sleep 2
truncate -s 0 $MNT/file
#TEST_CASE1_END
Or run xfstests test 286: https://github.com/dmonakhov/xfstests/blob/devel/286
This patch makes the state machine simple and clean:

(1) xxx_end_io schedules the final extent conversion simply by calling
    ext4_add_complete_io(), which appends it to ei->i_completed_io_list.
    NOTE1: because of (2A), the work should be queued only if
    ->i_completed_io_list was empty; otherwise the work is already scheduled.

(2) ext4_flush_completed_IO is responsible for handling all pending
    end_io from ei->i_completed_io_list.
    The flushing sequence consists of the following stages:
    A) LOCKED: atomically drain completed_io_list to a local list
    B) Perform extent conversion
    C) LOCKED: move converted io's to the to_free list for final deletion;
       this logic depends on the context we were called from.
    D) Final end_io context destruction
    NOTE2: i_mutex is no longer required because end_io->flags modification
    is protected by ei->i_completed_io_lock.
Full list of changes:
- Move all completion end_io related routines to page-io.c in order to
  improve logic locality.
- Move open-coded logic from various xx_end_xx routines to
  ext4_add_complete_io().
- Remove EXT4_IO_END_FSYNC.
- Improve SMP scalability by removing the useless i_mutex, which does not
  protect io->flags anymore.
- Reduce lock contention on i_completed_io_lock by optimizing the list walk.
- Rename ext4_end_io_nolock to ext4_end_io and make it static.
- Check the flush completion status in ext4_ext_punch_hole(), because it is
  not a good idea to punch blocks from a corrupted inode.
Changes since V3 (in response to Jan's comments):
  Fall back to the active flush_completed_IO() approach in order to prevent
  performance issues with nonlocked DIO reads.
Changes since V2:
  Fix use-after-free caused by a truncate vs end_io_work race.
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/ext4.h | 3 +-
fs/ext4/extents.c | 4 +-
fs/ext4/fsync.c | 81 -------------------------
fs/ext4/indirect.c | 6 +-
fs/ext4/inode.c | 25 +-------
fs/ext4/page-io.c | 171 ++++++++++++++++++++++++++++++++++------------------
6 files changed, 121 insertions(+), 169 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0d393dc..9c3d004 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -186,7 +186,6 @@ struct mpage_da_data {
#define EXT4_IO_END_ERROR 0x0002
#define EXT4_IO_END_QUEUED 0x0004
#define EXT4_IO_END_DIRECT 0x0008
-#define EXT4_IO_END_IN_FSYNC 0x0010
struct ext4_io_page {
struct page *p_page;
@@ -2424,11 +2423,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern int ext4_end_io_nolock(ext4_io_end_t *io);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 69e2d13..70ba122 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4607,7 +4607,9 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
}
/* finish any pending end_io work */
- ext4_flush_completed_IO(inode);
+ err = ext4_flush_completed_IO(inode);
+ if (err)
+ return err;
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, credits);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 323eb15..4600008 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,87 +34,6 @@
#include <trace/events/ext4.h>
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef EXT4FS_DEBUG
- struct list_head *cur, *before, *after;
- ext4_io_end_t *io, *io0, *io1;
- unsigned long flags;
-
- if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
- ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
- return;
- }
-
- ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
- cur = &io->list;
- before = cur->prev;
- io0 = container_of(before, ext4_io_end_t, list);
- after = cur->next;
- io1 = container_of(after, ext4_io_end_t, list);
-
- ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
- io, inode->i_ino, io0, io1);
- }
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int ext4_flush_completed_IO(struct inode *inode)
-{
- ext4_io_end_t *io;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
- int ret = 0;
- int ret2 = 0;
-
- dump_completed_IO(inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- while (!list_empty(&ei->i_completed_io_list)){
- io = list_entry(ei->i_completed_io_list.next,
- ext4_io_end_t, list);
- list_del_init(&io->list);
- io->flag |= EXT4_IO_END_IN_FSYNC;
- /*
- * Calling ext4_end_io_nolock() to convert completed
- * IO to written.
- *
- * When ext4_sync_file() is called, run_queue() may already
- * about to flush the work corresponding to this io structure.
- * It will be upset if it founds the io structure related
- * to the work-to-be schedule is freed.
- *
- * Thus we need to keep the io structure still valid here after
- * conversion finished. The io structure has a flag to
- * avoid double converting from both fsync and background work
- * queue work.
- */
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- ret = ext4_end_io_nolock(io);
- if (ret < 0)
- ret2 = ret;
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- io->flag &= ~EXT4_IO_END_IN_FSYNC;
- }
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- return (ret2 < 0) ? ret2 : 0;
-}
-
/*
* If we're not journaling and this is a just-created file, we have to
* sync our parent directory (if it was freshly created) since
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 9917c42..ebc7d83 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -808,11 +808,9 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
retry:
if (rw == READ && ext4_should_dioread_nolock(inode)) {
- if (unlikely(!list_empty(&ei->i_completed_io_list))) {
- mutex_lock(&inode->i_mutex);
+ if (unlikely(!list_empty(&ei->i_completed_io_list)))
ext4_flush_completed_IO(inode);
- mutex_unlock(&inode->i_mutex);
- }
+
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3a28cf7..1ea34e5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2853,9 +2853,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
ext4_io_end_t *io_end = iocb->private;
- struct workqueue_struct *wq;
- unsigned long flags;
- struct ext4_inode_info *ei;
/* if not async direct IO or dio with 0 bytes write, just return */
if (!io_end || !size)
@@ -2884,24 +2881,14 @@ out:
io_end->iocb = iocb;
io_end->result = ret;
}
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
- /* Add the io_end to per-inode completed aio dio list*/
- ei = EXT4_I(io_end->inode);
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
}
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
{
ext4_io_end_t *io_end = bh->b_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
if (!test_clear_buffer_uninit(bh) || !io_end)
goto out;
@@ -2920,15 +2907,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
*/
inode = io_end->inode;
ext4_set_io_unwritten_flag(inode, io_end);
-
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
- wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
- /* queue the work to convert unwritten extents to written */
- queue_work(wq, &io_end->work);
+ ext4_add_complete_io(io_end);
out:
bh->b_private = NULL;
bh->b_end_io = NULL;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 9970022..5b24c40 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -71,6 +71,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
int i;
BUG_ON(!io);
+ BUG_ON(!list_empty(&io->list));
BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
if (io->page)
@@ -83,21 +84,14 @@ void ext4_free_io_end(ext4_io_end_t *io)
kmem_cache_free(io_end_cachep, io);
}
-/*
- * check a range of space and convert unwritten extents to written.
- *
- * Called with inode->i_mutex; we depend on this when we manipulate
- * io->flag, since we could otherwise race with ext4_flush_completed_IO()
- */
-int ext4_end_io_nolock(ext4_io_end_t *io)
+/* check a range of space and convert unwritten extents to written. */
+static int ext4_end_io(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
ssize_t size = io->size;
int ret = 0;
- BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
-
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
@@ -110,7 +104,6 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
- io->flag &= ~EXT4_IO_END_UNWRITTEN;
if (io->iocb)
aio_complete(io->iocb, io->result, 0);
@@ -122,51 +115,122 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
return ret;
}
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
+static void dump_completed_IO(struct inode *inode)
+{
+#ifdef EXT4FS_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io, *io0, *io1;
+ unsigned long flags;
+
+ if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
+ ext4_debug("inode %lu completed_io list is empty\n",
+ inode->i_ino);
+ return;
+ }
+
+ ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
+ list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
+ cur = &io->list;
+ before = cur->prev;
+ io0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io, inode->i_ino, io0, io1);
+ }
+#endif
+}
+
+/* Add the io_end to per-inode completed end_io list. */
+void ext4_add_complete_io(ext4_io_end_t *io_end)
{
- ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
- struct inode *inode = io->inode;
- struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long flags;
+ struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+ struct workqueue_struct *wq;
+ unsigned long flags;
+
+ BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (io->flag & EXT4_IO_END_IN_FSYNC)
- goto requeue;
- if (list_empty(&io->list)) {
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- goto free;
+ if (list_empty(&ei->i_completed_io_list)) {
+ io_end->flag |= EXT4_IO_END_QUEUED;
+ queue_work(wq, &io_end->work);
}
+ list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
- if (!mutex_trylock(&inode->i_mutex)) {
- bool was_queued;
-requeue:
- was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
- io->flag |= EXT4_IO_END_QUEUED;
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- /*
- * Requeue the work instead of waiting so that the work
- * items queued after this can be processed.
- */
- queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
- /*
- * To prevent the ext4-dio-unwritten thread from keeping
- * requeueing end_io requests and occupying cpu for too long,
- * yield the cpu if it sees an end_io request that has already
- * been requeued.
- */
- if (was_queued)
- yield();
- return;
+static int ext4_do_flush_completed_IO(struct inode *inode,
+ ext4_io_end_t *work_io)
+{
+ ext4_io_end_t *io;
+ struct list_head unwritten, complete, to_free;
+ unsigned long flags;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int err, ret = 0;
+
+ INIT_LIST_HEAD(&complete);
+ INIT_LIST_HEAD(&to_free);
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ dump_completed_IO(inode);
+ list_replace_init(&ei->i_completed_io_list, &unwritten);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ while (!list_empty(&unwritten)) {
+ io = list_entry(unwritten.next, ext4_io_end_t, list);
+ BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+ list_del_init(&io->list);
+
+ err = ext4_end_io(io);
+ if (unlikely(!ret && err))
+ ret = err;
+
+ list_add_tail(&io->list, &complete);
+ }
+ /* It is important to update all flags for all end_io in one shot w/o
+ * dropping the lock.*/
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ while (!list_empty(&complete)) {
+ io = list_entry(complete.next, ext4_io_end_t, list);
+ io->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* end_io context can not be destroyed now because it still
+ * used by queued worker. Worker thread will destroy it later */
+ if (io->flag & EXT4_IO_END_QUEUED)
+ list_del_init(&io->list);
+ else
+ list_move(&io->list, &to_free);
+ }
+ /* If we are called from worker context, it is time to clear queued
+ * flag, and destroy it's end_io if it was converted already */
+ if (work_io) {
+ work_io->flag &= ~EXT4_IO_END_QUEUED;
+ if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
+ list_add_tail(&work_io->list, &to_free);
}
- list_del_init(&io->list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
- (void) ext4_end_io_nolock(io);
- mutex_unlock(&inode->i_mutex);
-free:
- ext4_free_io_end(io);
+
+ while (!list_empty(&to_free)) {
+ io = list_entry(to_free.next, ext4_io_end_t, list);
+ list_del_init(&io->list);
+ ext4_free_io_end(io);
+ }
+ return ret;
+}
+
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ ext4_do_flush_completed_IO(io->inode, io);
+}
+
+int ext4_flush_completed_IO(struct inode *inode)
+{
+ return ext4_do_flush_completed_IO(inode, NULL);
}
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -199,9 +263,7 @@ static void buffer_io_error(struct buffer_head *bh)
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- struct workqueue_struct *wq;
struct inode *inode;
- unsigned long flags;
int i;
sector_t bi_sector = bio->bi_sector;
@@ -259,14 +321,7 @@ static void ext4_end_bio(struct bio *bio, int error)
return;
}
- /* Add the io_end to per-inode completed io list*/
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
- spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
RFC_MESSAGE: It is up to the committer whether to pick or drop this patch.
Only one user exists, so it may be reasonable to move the function inside
the caller's body. The only disadvantage is that it makes
ext4_do_flush_completed_IO() less readable.
COMMIT_MESSAGE:
ext4_do_flush_completed_IO() is the only user of this function.
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/page-io.c | 56 ++++++++++++++++++++--------------------------------
1 files changed, 22 insertions(+), 34 deletions(-)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 5b24c40..0435688 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -84,37 +84,6 @@ void ext4_free_io_end(ext4_io_end_t *io)
kmem_cache_free(io_end_cachep, io);
}
-/* check a range of space and convert unwritten extents to written. */
-static int ext4_end_io(ext4_io_end_t *io)
-{
- struct inode *inode = io->inode;
- loff_t offset = io->offset;
- ssize_t size = io->size;
- int ret = 0;
-
- ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
- "list->prev 0x%p\n",
- io, inode->i_ino, io->list.next, io->list.prev);
-
- ret = ext4_convert_unwritten_extents(inode, offset, size);
- if (ret < 0) {
- ext4_msg(inode->i_sb, KERN_EMERG,
- "failed to convert unwritten extents to written "
- "extents -- potential data loss! "
- "(inode %lu, offset %llu, size %zd, error %d)",
- inode->i_ino, offset, size, ret);
- }
- if (io->iocb)
- aio_complete(io->iocb, io->result, 0);
-
- if (io->flag & EXT4_IO_END_DIRECT)
- inode_dio_done(inode);
- /* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
- wake_up_all(ext4_ioend_wq(io->inode));
- return ret;
-}
-
static void dump_completed_IO(struct inode *inode)
{
#ifdef EXT4FS_DEBUG
@@ -183,9 +152,28 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
list_del_init(&io->list);
- err = ext4_end_io(io);
- if (unlikely(!ret && err))
- ret = err;
+ ext4_debug("ext4_do_flush_completed_IO: io 0x%p, inode %lu\n",
+ io, inode->i_ino);
+
+ err = ext4_convert_unwritten_extents(inode, io->offset,
+ io->size);
+ if (err < 0) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "failed to convert unwritten extents to written"
+ " extents -- potential data loss! "
+ "(inode %lu, offset %llu, size %zd, error %d)",
+ inode->i_ino, io->offset, io->size, err);
+ if (!ret)
+ ret = err;
+ }
+ if (io->iocb)
+ aio_complete(io->iocb, io->result, 0);
+
+ if (io->flag & EXT4_IO_END_DIRECT)
+ inode_dio_done(inode);
+ /* Wake up anyone waiting on unwritten extent conversion */
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+ wake_up_all(ext4_ioend_wq(io->inode));
list_add_tail(&io->list, &complete);
}
--
1.7.7.6
An inode's block defrag and ext4_change_inode_journal_flag() may affect
the result of nonlocked DIO reads, so proper synchronization is required.
- Add missed inode_dio_wait() calls where appropriate.
- Check inode state under an extra i_dio_count reference.
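For illustration only, here is a minimal userspace sketch of the barrier
pairing, using C11 atomics in place of ext4_set_inode_state()/smp_mb();
every name is hypothetical. The point is the classic store-buffering rule:
either the blocker observes the reader's i_dio_count reference, or the
reader observes the DIOREAD_LOCK flag, never neither.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool dioread_lock;        /* models EXT4_STATE_DIOREAD_LOCK */
static atomic_int dio_count;            /* models inode->i_dio_count */

/* Blocker side: publish the flag, full barrier, then wait for readers. */
static void inode_block_unlocked_dio(void)
{
        atomic_store(&dioread_lock, true);
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() */
        while (atomic_load(&dio_count) > 0)
                ;                                       /* inode_dio_wait() */
}

static void inode_resume_unlocked_dio(void)
{
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() */
        atomic_store(&dioread_lock, false);
}

/* Reader side: take the in-flight ref first, full barrier, then check
 * the flag. */
static bool try_unlocked_dio_read(void)
{
        atomic_fetch_add(&dio_count, 1);
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() */
        if (atomic_load(&dioread_lock)) {
                atomic_fetch_sub(&dio_count, 1);        /* inode_dio_done() */
                return false;   /* caller falls back to the locked path */
        }
        return true;    /* caller drops dio_count when the read completes */
}

int main(void)
{
        if (try_unlocked_dio_read())
                atomic_fetch_sub(&dio_count, 1);        /* read done */
        inode_block_unlocked_dio();
        inode_resume_unlocked_dio();
        printf("ok\n");
        return 0;
}

Remove either barrier and the reordering reopens the race window: a reader
may slip into the nolock path while the blocker believes no reads are in
flight.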
Changes from V2:
- fix typo in condition
Changes from V1:
- add missed memory barriers
- move the DIOREAD_LOCK state check out of the generic
  should_dioread_nolock(), otherwise it will affect existing DIO readers.
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/ext4.h | 17 +++++++++++++++++
fs/ext4/indirect.c | 14 ++++++++++++++
fs/ext4/inode.c | 5 +++++
fs/ext4/move_extent.c | 8 ++++++++
4 files changed, 44 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9c3d004..885dc79 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1363,6 +1363,8 @@ enum {
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
+ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
+ nolocking */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -2471,6 +2473,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+ ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+ smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb();
+ ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
/* For ioend & aio unwritten conversion wait queues */
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index ebc7d83..9e6cdb6 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -811,11 +811,25 @@ retry:
if (unlikely(!list_empty(&ei->i_completed_io_list)))
ext4_flush_completed_IO(inode);
+ /*
+ * Nolock dioread optimization may be dynamically disabled
+ * via ext4_inode_block_unlocked_dio(). Check inode's state
+ * while holding extra i_dio_count ref.
+ */
+ atomic_inc(&inode->i_dio_count);
+ smp_mb();
+ if (unlikely(ext4_test_inode_state(inode,
+ EXT4_STATE_DIOREAD_LOCK))) {
+ inode_dio_done(inode);
+ goto locked;
+ }
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL, NULL, 0);
+ inode_dio_done(inode);
} else {
+locked:
ret = blockdev_direct_IO(rw, iocb, inode, iov,
offset, nr_segs, ext4_get_block);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1ea34e5..da52855 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4692,6 +4692,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
jbd2_journal_lock_updates(journal);
/*
@@ -4711,6 +4715,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
+ ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c2e47da..0aaaf12 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1325,6 +1325,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* Protect orig and donor inodes against a truncate */
mext_inode_double_lock(orig_inode, donor_inode);
+ /* Wait for all existing dio workers */
+ ext4_inode_block_unlocked_dio(orig_inode);
+ ext4_inode_block_unlocked_dio(donor_inode);
+ inode_dio_wait(orig_inode);
+ inode_dio_wait(donor_inode);
+
/* Protect extent tree against block allocations via delalloc */
double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
@@ -1523,6 +1529,8 @@ out:
kfree(holecheck_path);
}
double_up_write_data_sem(orig_inode, donor_inode);
+ ext4_inode_resume_unlocked_dio(orig_inode);
+ ext4_inode_resume_unlocked_dio(donor_inode);
mext_inode_double_unlock(orig_inode, donor_inode);
return ret;
--
1.7.7.6
The current serialization works only for DIO which holds i_mutex; with
nonlocked DIO the following race is possible:

dio_nolock_read_task                   truncate_task
                                       ->ext4_setattr()
                                        ->inode_dio_wait()
->ext4_ext_direct_IO
 ->ext4_ind_direct_IO
  ->__blockdev_direct_IO
   ->ext4_get_block
                                        ->truncate_setsize()
                                        ->ext4_truncate()
                                        #alloc truncated blocks
                                        #to other inode
   ->submit_io()
   #INFORMATION LEAK
In order to serialize with unlocked DIO reads we have to rearrange the
wait sequence:
1) update i_size first
2) if i_size is about to be reduced, wait for outstanding DIO requests
3) and only after that truncate the inode blocks
Reviewed-by: Jan Kara <[email protected]>
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/inode.c | 7 +++++--
1 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index da52855..583cb3f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4255,7 +4255,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- inode_dio_wait(inode);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4304,8 +4303,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != i_size_read(inode))
+ if (attr->ia_size != inode->i_size) {
truncate_setsize(inode, attr->ia_size);
+ /* Inode size will be reduced, wait for dio in flight */
+ if (orphan)
+ inode_dio_wait(inode);
+ }
ext4_truncate(inode);
}
--
1.7.7.6
ext4_set_io_unwritten_flag() will increment the i_unwritten counter, so
once we mark an end_io with EXT4_IO_END_UNWRITTEN we have to revert it back
on the error path:
- add missed error checks to prevent counter leakage
- ext4_end_io_nolock() will clear the EXT4_IO_END_UNWRITTEN flag to signal
  that conversion has finished
- add a BUG_ON to ext4_free_io_end() to prevent similar leakage in the future
The visible effect of this bug is that unaligned aio_stress may deadlock.
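A minimal userspace sketch of the pairing rule this patch enforces, with
C11 atomics and assert() standing in for the kernel's atomic_t and BUG_ON
(all names are illustrative): the flag and the counter must always move
together, and freeing with the flag still set means a lost decrement.

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

#define IO_END_UNWRITTEN 0x1    /* models EXT4_IO_END_UNWRITTEN */

static atomic_int i_unwritten;  /* pending conversions */

struct io_end {
        unsigned int flag;
};

/* Flag and counter move together (models ext4_set_io_unwritten_flag()). */
static void set_io_unwritten_flag(struct io_end *io)
{
        if (!(io->flag & IO_END_UNWRITTEN)) {
                io->flag |= IO_END_UNWRITTEN;
                atomic_fetch_add(&i_unwritten, 1);
        }
}

/* Conversion clears the flag and drops the counter in one place. */
static void end_io(struct io_end *io)
{
        assert(io->flag & IO_END_UNWRITTEN);
        /* ... convert unwritten extents here ... */
        io->flag &= ~IO_END_UNWRITTEN;
        atomic_fetch_sub(&i_unwritten, 1);
}

/* Leak detector: freeing with the flag still set would leave waiters on
 * the counter hanging forever - the aio_stress deadlock above. */
static void free_io_end(struct io_end *io)
{
        assert(!(io->flag & IO_END_UNWRITTEN));
}

int main(void)
{
        struct io_end io = { 0 };

        set_io_unwritten_flag(&io);
        end_io(&io);
        free_io_end(&io);
        printf("pending: %d\n", atomic_load(&i_unwritten));
        return 0;
}

The BUG_ON added to ext4_free_io_end() in this patch plays the role of the
last assert above.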
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/extents.c | 21 ++++++++++++++-------
fs/ext4/page-io.c | 6 +++++-
2 files changed, 19 insertions(+), 8 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e9549f9..69e2d13 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3545,6 +3545,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
ret = ext4_split_unwritten_extents(handle, inode, map,
path, flags);
+ if (ret <= 0)
+ goto out;
/*
* Flag the inode(non aio case) or end_io struct (aio case)
* that this IO needs to conversion to written when IO is
@@ -3790,6 +3792,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_allocation_request ar;
ext4_io_end_t *io = ext4_inode_aio(inode);
ext4_lblk_t cluster_offset;
+ int set_unwritten = 0;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
@@ -4012,13 +4015,8 @@ got_allocated_blocks:
* For non asycn direct IO case, flag the inode state
* that we need to perform conversion when IO is done.
*/
- if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- if (io)
- ext4_set_io_unwritten_flag(inode, io);
- else
- ext4_set_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN);
- }
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+ set_unwritten = 1;
if (ext4_should_dioread_nolock(inode))
map->m_flags |= EXT4_MAP_UNINIT;
}
@@ -4030,6 +4028,15 @@ got_allocated_blocks:
if (!err)
err = ext4_ext_insert_extent(handle, inode, path,
&newex, flags);
+
+ if (!err && set_unwritten) {
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
+ ext4_set_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN);
+ }
+
if (err && free_on_err) {
int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index de77e31..9970022 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -71,6 +71,8 @@ void ext4_free_io_end(ext4_io_end_t *io)
int i;
BUG_ON(!io);
+ BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
+
if (io->page)
put_page(io->page);
for (i = 0; i < io->num_io_pages; i++)
@@ -94,6 +96,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
ssize_t size = io->size;
int ret = 0;
+ BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
@@ -106,7 +110,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
An inode is allowed to have an empty leaf only if it is a blockless inode.
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/extents.c | 16 +++++++++-------
1 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a1d16eb..5ed6e86 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2589,7 +2589,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct ext4_ext_path *path = NULL;
ext4_fsblk_t partial_cluster = 0;
handle_t *handle;
- int i = 0, err;
+ int i = 0, err = 0;
ext_debug("truncate since %u to %u\n", start, end);
@@ -2621,12 +2621,16 @@ again:
return PTR_ERR(path);
}
depth = ext_depth(inode);
+	/* Leaf may not exist only if inode has no blocks at all */
ex = path[depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
- path = NULL;
- goto cont;
+ if (depth) {
+ EXT4_ERROR_INODE(inode,
+ "path[%d].p_hdr == NULL",
+ depth);
+ err = -EIO;
+ }
+ goto out;
}
ee_block = le32_to_cpu(ex->ee_block);
@@ -2658,8 +2662,6 @@ again:
goto out;
}
}
-cont:
If there are enough aggressive DIO readers, truncate and other DIO
waiters will wait forever inside inode_dio_wait(). It is reasonable
to disable the nonlocked DIO read optimization during truncate.
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/inode.c | 9 +++++++--
1 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 583cb3f..c40a98d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4305,9 +4305,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_SIZE) {
if (attr->ia_size != inode->i_size) {
truncate_setsize(inode, attr->ia_size);
- /* Inode size will be reduced, wait for dio in flight */
- if (orphan)
+ /* Inode size will be reduced, wait for dio in flight.
+ * Temporarily disable dioread_nolock to prevent
+ * livelock. */
+ if (orphan) {
+ ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ }
}
ext4_truncate(inode);
}
--
1.7.7.6
punch_hole is a place where we have to wait for all existing writers
(writeback, aio, dio), but currently we simply flush pending end_io
requests, which is not sufficient. Another issue is that punch_hole is
performed without i_mutex held, which obviously results in dangerous data
corruption due to write-after-free.
This patch performs the following changes:
- Guard punch_hole with i_mutex
- Recheck inode flags under i_mutex
- Block all new dio readers in order to prevent an information leak caused
  by the read-after-free pattern.
- punch_hole now waits for all writers in flight
NOTE: XXX a write-after-free race is still possible because new dirty pages
may appear due to mmap(), and currently there is no easy way to stop
writeback while punch_hole is in progress.
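The flag recheck is needed because inode flags may change between the
caller's unlocked permission check and the moment i_mutex is taken. A
minimal userspace sketch of this check-under-lock pattern, with a pthread
mutex standing in for i_mutex and made-up names throughout:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct fake_inode {
        pthread_mutex_t i_mutex;
        int append_only;        /* models IS_APPEND() */
        int swapfile;           /* models IS_SWAPFILE() */
};

static int punch_hole(struct fake_inode *inode)
{
        int err = 0;

        pthread_mutex_lock(&inode->i_mutex);
        /* Recheck the flags: they may have changed between the caller's
         * unlocked check and the moment we took the lock. */
        if (inode->append_only) {
                err = -EPERM;
                goto out_mutex;
        }
        if (inode->swapfile) {
                err = -ETXTBSY;
                goto out_mutex;
        }
        /* ... block unlocked DIO, inode_dio_wait(), punch the hole ... */
out_mutex:
        pthread_mutex_unlock(&inode->i_mutex);
        return err;
}

int main(void)
{
        struct fake_inode inode = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

        printf("punch_hole -> %d\n", punch_hole(&inode));
        return 0;
}

The goto-based unwind also guarantees i_mutex is dropped on every error
path, which the patch below follows as well.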
Changes from V1:
Add flag checks once we hold i_mutex
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/extents.c | 50 +++++++++++++++++++++++++++++++++-----------------
1 files changed, 33 insertions(+), 17 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 70ba122..a1d16eb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4568,9 +4568,29 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
loff_t first_page_offset, last_page_offset;
int credits, err = 0;
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ err = filemap_write_and_wait_range(mapping,
+ offset, offset + length - 1);
+
+ if (err)
+ return err;
+ }
+
+	mutex_lock(&inode->i_mutex);
+	/* Recheck file flags under i_mutex: punching a hole is not
+	 * possible on an append-only or immutable file */
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+		err = -EPERM;
+		goto out_mutex;
+	}
+	if (IS_SWAPFILE(inode)) {
+		err = -ETXTBSY;
+		goto out_mutex;
+	}
+
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
- return 0;
+ goto out_mutex;
/*
* If the hole extends beyond i_size, set the hole
@@ -4588,33 +4608,25 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
first_page_offset = first_page << PAGE_CACHE_SHIFT;
last_page_offset = last_page << PAGE_CACHE_SHIFT;
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- err = filemap_write_and_wait_range(mapping,
- offset, offset + length - 1);
-
- if (err)
- return err;
- }
-
/* Now release the pages */
if (last_page_offset > first_page_offset) {
truncate_pagecache_range(inode, first_page_offset,
last_page_offset - 1);
}
- /* finish any pending end_io work */
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
err = ext4_flush_completed_IO(inode);
if (err)
- return err;
+ goto out_dio;
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_dio;
+ }
/*
@@ -4706,6 +4718,10 @@ out:
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
return err;
}
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
--
1.7.7.6
Jan Kara has spotted an interesting issue:
there is a potential data corruption issue with direct IO overwrites
racing with truncate:
dio write                             truncate_task
->ext4_ext_direct_IO
 ->overwrite == 1
  ->down_read(&EXT4_I(inode)->i_data_sem);
  ->mutex_unlock(&inode->i_mutex);
                                      ->ext4_setattr()
                                       ->inode_dio_wait()
                                       ->truncate_setsize()
                                       ->ext4_truncate()
                                        ->down_write(&EXT4_I(inode)->i_data_sem);
  ->__blockdev_direct_IO
   ->ext4_get_block
    ->submit_io()
  ->up_read(&EXT4_I(inode)->i_data_sem);
                                       # truncate data blocks, allocate them to
                                       # other inode - bad stuff happens because
                                       # dio is still in flight.
In order to serialize with truncate, the dio worker should grab an extra
i_dio_count reference before dropping i_mutex.
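A minimal userspace sketch of the fix, with pthread primitives standing in
for i_mutex and inode_dio_wait() and all names illustrative: the overwrite
path takes its in-flight reference while still holding the big lock, so
truncate can never observe a window where neither the lock is held nor the
counter is raised.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dio_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t dio_cv = PTHREAD_COND_INITIALIZER;
static int i_dio_count;         /* models inode->i_dio_count */

static void inode_dio_begin(void)       /* models atomic_inc(&i_dio_count) */
{
        pthread_mutex_lock(&dio_lock);
        i_dio_count++;
        pthread_mutex_unlock(&dio_lock);
}

static void inode_dio_done(void)
{
        pthread_mutex_lock(&dio_lock);
        if (--i_dio_count == 0)
                pthread_cond_broadcast(&dio_cv);
        pthread_mutex_unlock(&dio_lock);
}

static void inode_dio_wait(void)
{
        pthread_mutex_lock(&dio_lock);
        while (i_dio_count > 0)
                pthread_cond_wait(&dio_cv, &dio_lock);
        pthread_mutex_unlock(&dio_lock);
}

/* Overwrite DIO: take the in-flight ref before dropping i_mutex. */
static void overwrite_dio_write(void)
{
        pthread_mutex_lock(&i_mutex);
        inode_dio_begin();
        pthread_mutex_unlock(&i_mutex); /* safe: truncate must now wait */
        /* ... submit and complete the overwrite I/O ... */
        inode_dio_done();
}

/* Truncate: inode_dio_wait() drains the reference taken above. */
static void truncate_task(void)
{
        pthread_mutex_lock(&i_mutex);
        inode_dio_wait();
        /* ... truncate blocks ... */
        pthread_mutex_unlock(&i_mutex);
}

int main(void)
{
        overwrite_dio_write();
        truncate_task();
        printf("ok\n");
        return 0;
}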
Changes against V1:
- wake up dio waiters (inode_dio_done()) before re-taking i_mutex.
Reviewed-by: Jan Kara <[email protected]>
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/inode.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c40a98d..14b7096 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2982,6 +2982,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
overwrite = *((int *)iocb->private);
if (overwrite) {
+ atomic_inc(&inode->i_dio_count);
down_read(&EXT4_I(inode)->i_data_sem);
mutex_unlock(&inode->i_mutex);
}
@@ -3079,6 +3080,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
retake_lock:
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite) {
+ inode_dio_done(inode);
up_read(&EXT4_I(inode)->i_data_sem);
mutex_lock(&inode->i_mutex);
}
--
1.7.7.6
On Fri, Sep 28, 2012 at 07:44:07PM +0400, Dmitry Monakhov wrote:
> if (attr->ia_valid & ATTR_SIZE) {
> - if (attr->ia_size != i_size_read(inode))
> + if (attr->ia_size != inode->i_size) {
Why this change? Why are you not using i_size_read() here?
- Ted
On Sat, 29 Sep 2012 00:49:04 -0400, "Theodore Ts'o" <[email protected]> wrote:
> On Fri, Sep 28, 2012 at 07:44:07PM +0400, Dmitry Monakhov wrote:
> > if (attr->ia_valid & ATTR_SIZE) {
> > - if (attr->ia_size != i_size_read(inode))
> > + if (attr->ia_size != inode->i_size) {
>
> Why this change? Why are you not using i_size_read() here?
Since we hold i_mutex no one is able to change i_size, and it
is safe to access it directly.
>
> - Ted
On Fri 28-09-12 19:44:01, Dmitry Monakhov wrote:
> The generic inode has an unused i_private pointer which may be used as
> cur_aio_dio storage.
>
> TODO: if cur_aio_dio is passed as an argument to get_block_t, this will
> allow concurrent AIO/DIO requests.
>
> Reviewed-by: Zheng Liu <[email protected]>
> Signed-off-by: Dmitry Monakhov <[email protected]>
Looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> fs/ext4/ext4.h | 12 ++++++++++--
> fs/ext4/extents.c | 4 ++--
> fs/ext4/inode.c | 6 +++---
> fs/ext4/super.c | 1 -
> 4 files changed, 15 insertions(+), 8 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 6f37c11..42be3ae 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -917,8 +917,6 @@ struct ext4_inode_info {
> struct list_head i_completed_io_list;
> spinlock_t i_completed_io_lock;
> atomic_t i_ioend_count; /* Number of outstanding io_end structs */
> - /* current io_end structure for async DIO write*/
> - ext4_io_end_t *cur_aio_dio;
> atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
>
> spinlock_t i_block_reservation_lock;
> @@ -1343,6 +1341,16 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
> }
> }
>
> +static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
> +{
> + return inode->i_private;
> +}
> +
> +static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
> +{
> + inode->i_private = io;
> +}
> +
> /*
> * Inode dynamic state flags
> */
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index c29379d..e9549f9 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -3530,7 +3530,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
> {
> int ret = 0;
> int err = 0;
> - ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
> + ext4_io_end_t *io = ext4_inode_aio(inode);
>
> ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
> "block %llu, max_blocks %u, flags %x, allocated %u\n",
> @@ -3788,7 +3788,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
> unsigned int allocated = 0, offset = 0;
> unsigned int allocated_clusters = 0;
> struct ext4_allocation_request ar;
> - ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
> + ext4_io_end_t *io = ext4_inode_aio(inode);
> ext4_lblk_t cluster_offset;
>
> ext_debug("blocks %u/%u requested for inode %lu\n",
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index ad568b8..3a28cf7 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3028,7 +3028,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> * hook to the iocb.
> */
> iocb->private = NULL;
> - EXT4_I(inode)->cur_aio_dio = NULL;
> + ext4_inode_aio_set(inode, NULL);
> if (!is_sync_kiocb(iocb)) {
> ext4_io_end_t *io_end =
> ext4_init_io_end(inode, GFP_NOFS);
> @@ -3045,7 +3045,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> * is a unwritten extents needs to be converted
> * when IO is completed.
> */
> - EXT4_I(inode)->cur_aio_dio = iocb->private;
> + ext4_inode_aio_set(inode, io_end);
> }
>
> if (overwrite)
> @@ -3065,7 +3065,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> NULL,
> DIO_LOCKING);
> if (iocb->private)
> - EXT4_I(inode)->cur_aio_dio = NULL;
> + ext4_inode_aio_set(inode, NULL);
> /*
> * The io_end structure takes a reference to the inode,
> * that structure needs to be destroyed and the
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index d5f4c97..b5dfff8 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -967,7 +967,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
> ei->jinode = NULL;
> INIT_LIST_HEAD(&ei->i_completed_io_list);
> spin_lock_init(&ei->i_completed_io_lock);
> - ei->cur_aio_dio = NULL;
> ei->i_sync_tid = 0;
> ei->i_datasync_tid = 0;
> atomic_set(&ei->i_ioend_count, 0);
> --
> 1.7.7.6
>
On Fri 28-09-12 19:44:03, Dmitry Monakhov wrote:
> ext4_set_io_unwritten_flag() will increment the i_unwritten counter, so
> once we mark an end_io with EXT4_IO_END_UNWRITTEN we have to revert it back
> on the error path:
>
> - add missed error checks to prevent counter leakage
> - ext4_end_io_nolock() will clear the EXT4_IO_END_UNWRITTEN flag to signal
>   that conversion has finished
> - add a BUG_ON to ext4_free_io_end() to prevent similar leakage in the future
>
> The visible effect of this bug is that unaligned aio_stress may deadlock.
Looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
>
> Signed-off-by: Dmitry Monakhov <[email protected]>
> ---
> fs/ext4/extents.c | 21 ++++++++++++++-------
> fs/ext4/page-io.c | 6 +++++-
> 2 files changed, 19 insertions(+), 8 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index e9549f9..69e2d13 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -3545,6 +3545,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
> if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
> ret = ext4_split_unwritten_extents(handle, inode, map,
> path, flags);
> + if (ret <= 0)
> + goto out;
> /*
> * Flag the inode(non aio case) or end_io struct (aio case)
> * that this IO needs to conversion to written when IO is
> @@ -3790,6 +3792,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
> struct ext4_allocation_request ar;
> ext4_io_end_t *io = ext4_inode_aio(inode);
> ext4_lblk_t cluster_offset;
> + int set_unwritten = 0;
>
> ext_debug("blocks %u/%u requested for inode %lu\n",
> map->m_lblk, map->m_len, inode->i_ino);
> @@ -4012,13 +4015,8 @@ got_allocated_blocks:
> * For non asycn direct IO case, flag the inode state
> * that we need to perform conversion when IO is done.
> */
> - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
> - if (io)
> - ext4_set_io_unwritten_flag(inode, io);
> - else
> - ext4_set_inode_state(inode,
> - EXT4_STATE_DIO_UNWRITTEN);
> - }
> + if ((flags & EXT4_GET_BLOCKS_PRE_IO))
> + set_unwritten = 1;
> if (ext4_should_dioread_nolock(inode))
> map->m_flags |= EXT4_MAP_UNINIT;
> }
> @@ -4030,6 +4028,15 @@ got_allocated_blocks:
> if (!err)
> err = ext4_ext_insert_extent(handle, inode, path,
> &newex, flags);
> +
> + if (!err && set_unwritten) {
> + if (io)
> + ext4_set_io_unwritten_flag(inode, io);
> + else
> + ext4_set_inode_state(inode,
> + EXT4_STATE_DIO_UNWRITTEN);
> + }
> +
> if (err && free_on_err) {
> int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
> EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index de77e31..9970022 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -71,6 +71,8 @@ void ext4_free_io_end(ext4_io_end_t *io)
> int i;
>
> BUG_ON(!io);
> + BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
> +
> if (io->page)
> put_page(io->page);
> for (i = 0; i < io->num_io_pages; i++)
> @@ -94,6 +96,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
> ssize_t size = io->size;
> int ret = 0;
>
> + BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
> +
> ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
> "list->prev 0x%p\n",
> io, inode->i_ino, io->list.next, io->list.prev);
> @@ -106,7 +110,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
> "(inode %lu, offset %llu, size %zd, error %d)",
> inode->i_ino, offset, size, ret);
> }
> -
> + io->flag &= ~EXT4_IO_END_UNWRITTEN;
> if (io->iocb)
> aio_complete(io->iocb, io->result, 0);
>
> --
> 1.7.7.6
>
On Fri 28-09-12 19:44:06, Dmitry Monakhov wrote:
> An inode's block defrag and ext4_change_inode_journal_flag() may affect
> the result of nonlocked DIO reads, so proper synchronization is required.
>
> - Add missed inode_dio_wait() calls where appropriate.
> - Check inode state under an extra i_dio_count reference.
Looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
>
> Changes from V2:
> - fix typo in condition
> Changes from V1:
> - add missed memory barriers
> - move the DIOREAD_LOCK state check out of the generic
>   should_dioread_nolock(), otherwise it will affect existing DIO readers.
>
> Signed-off-by: Dmitry Monakhov <[email protected]>
> ---
> fs/ext4/ext4.h | 17 +++++++++++++++++
> fs/ext4/indirect.c | 14 ++++++++++++++
> fs/ext4/inode.c | 5 +++++
> fs/ext4/move_extent.c | 8 ++++++++
> 4 files changed, 44 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 9c3d004..885dc79 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1363,6 +1363,8 @@ enum {
> EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
> EXT4_STATE_NEWENTRY, /* File just added to dir */
> EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
> + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
> + nolocking */
> };
>
> #define EXT4_INODE_BIT_FNS(name, field, offset) \
> @@ -2471,6 +2473,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
> set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
> }
>
> +/*
> + * Disable DIO read nolock optimization, so new dioreaders will be forced
> + * to grab i_mutex
> + */
> +static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
> +{
> + ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
> + smp_mb();
> +}
> +static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
> +{
> + smp_mb();
> + ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
> +}
> +
> #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
>
> /* For ioend & aio unwritten conversion wait queues */
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index ebc7d83..9e6cdb6 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -811,11 +811,25 @@ retry:
> if (unlikely(!list_empty(&ei->i_completed_io_list)))
> ext4_flush_completed_IO(inode);
>
> + /*
> + * Nolock dioread optimization may be dynamically disabled
> + * via ext4_inode_block_unlocked_dio(). Check inode's state
> + * while holding extra i_dio_count ref.
> + */
> + atomic_inc(&inode->i_dio_count);
> + smp_mb();
> + if (unlikely(ext4_test_inode_state(inode,
> + EXT4_STATE_DIOREAD_LOCK))) {
> + inode_dio_done(inode);
> + goto locked;
> + }
> ret = __blockdev_direct_IO(rw, iocb, inode,
> inode->i_sb->s_bdev, iov,
> offset, nr_segs,
> ext4_get_block, NULL, NULL, 0);
> + inode_dio_done(inode);
> } else {
> +locked:
> ret = blockdev_direct_IO(rw, iocb, inode, iov,
> offset, nr_segs, ext4_get_block);
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 1ea34e5..da52855 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4692,6 +4692,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> return err;
> }
>
> + /* Wait for all existing dio workers */
> + ext4_inode_block_unlocked_dio(inode);
> + inode_dio_wait(inode);
> +
> jbd2_journal_lock_updates(journal);
>
> /*
> @@ -4711,6 +4715,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> ext4_set_aops(inode);
>
> jbd2_journal_unlock_updates(journal);
> + ext4_inode_resume_unlocked_dio(inode);
>
> /* Finally we can mark the inode as dirty. */
>
> diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
> index c2e47da..0aaaf12 100644
> --- a/fs/ext4/move_extent.c
> +++ b/fs/ext4/move_extent.c
> @@ -1325,6 +1325,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
> /* Protect orig and donor inodes against a truncate */
> mext_inode_double_lock(orig_inode, donor_inode);
>
> + /* Wait for all existing dio workers */
> + ext4_inode_block_unlocked_dio(orig_inode);
> + ext4_inode_block_unlocked_dio(donor_inode);
> + inode_dio_wait(orig_inode);
> + inode_dio_wait(donor_inode);
> +
> /* Protect extent tree against block allocations via delalloc */
> double_down_write_data_sem(orig_inode, donor_inode);
> /* Check the filesystem environment whether move_extent can be done */
> @@ -1523,6 +1529,8 @@ out:
> kfree(holecheck_path);
> }
> double_up_write_data_sem(orig_inode, donor_inode);
> + ext4_inode_resume_unlocked_dio(orig_inode);
> + ext4_inode_resume_unlocked_dio(donor_inode);
> mext_inode_double_unlock(orig_inode, donor_inode);
>
> return ret;
> --
> 1.7.7.6
>
On Fri 28-09-12 19:44:08, Dmitry Monakhov wrote:
> If there are enough aggressive DIO readers, truncate and other DIO
> waiters will wait forever inside inode_dio_wait(). It is reasonable
> to disable the nonlocked DIO read optimization during truncate.
OK, as per our previous discussion I agree. So you can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
>
> Signed-off-by: Dmitry Monakhov <[email protected]>
> ---
> fs/ext4/inode.c | 9 +++++++--
> 1 files changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 583cb3f..c40a98d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4305,9 +4305,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
> if (attr->ia_valid & ATTR_SIZE) {
> if (attr->ia_size != inode->i_size) {
> truncate_setsize(inode, attr->ia_size);
> - /* Inode size will be reduced, wait for dio in flight */
> - if (orphan)
> + /* Inode size will be reduced, wait for dio in flight.
> + * Temporarily disable dioread_nolock to prevent
> + * livelock. */
> + if (orphan) {
> + ext4_inode_block_unlocked_dio(inode);
> inode_dio_wait(inode);
> + ext4_inode_resume_unlocked_dio(inode);
> + }
> }
> ext4_truncate(inode);
> }
> --
> 1.7.7.6
>
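[Sketch of the livelock this patch prevents. inode_dio_wait() is,
roughly, a wait loop on i_dio_count (simplified pseudo-code, not the
actual fs/inode.c source; wait_for_dio_wakeup() is a hypothetical
helper):

	/* waiter side, e.g. truncate */
	while (atomic_read(&inode->i_dio_count) != 0)
		wait_for_dio_wakeup(inode);

Without EXT4_STATE_DIOREAD_LOCK set, each new nonlocked reader can do
atomic_inc(&inode->i_dio_count) before the waiter observes zero, so a
steady stream of readers keeps the loop spinning forever. With the flag
set, new readers divert to the i_mutex-protected path (and i_mutex is
held by the truncate caller), the existing references drain, and the
wait terminates.]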
On Fri 28-09-12 19:44:10, Dmitry Monakhov wrote:
> punch_hole is the place where we have to wait for all existing writers
> (writeback, aio, dio), but currently we simply flush pending end_io
> requests, which is not sufficient. Another issue is that punch_hole is
> performed w/o i_mutex held, which obviously results in dangerous data
> corruption due to write-after-free.
>
> This patch performs the following changes:
> - Guard punch_hole with i_mutex
> - Recheck inode flags under i_mutex
> - Block all new dio readers in order to prevent an information leak
>   caused by a read-after-free pattern.
> - punch_hole now waits for all writers in flight
> NOTE: XXX a write-after-free race is still possible because new dirty
> pages may appear due to mmap(), and currently there is no easy way to
> stop writeback while punch_hole is in progress.
The patch looks good. Just one nit: the label 'out' in
ext4_ext_punch_hole() is now named contrary to the common scheme where
'out' is the outermost of the labels. So renaming it to something like
'out_orphan' would be good. Besides this you can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
>
> Changes from V1:
> Add flag checks once we hold i_mutex
>
> Signed-off-by: Dmitry Monakhov <[email protected]>
> ---
> fs/ext4/extents.c | 50 +++++++++++++++++++++++++++++++++-----------------
> 1 files changed, 33 insertions(+), 17 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 70ba122..a1d16eb 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4568,9 +4568,29 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
> loff_t first_page_offset, last_page_offset;
> int credits, err = 0;
>
> + /*
> + * Write out all dirty pages to avoid race conditions
> + * Then release them.
> + */
> + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> + err = filemap_write_and_wait_range(mapping,
> + offset, offset + length - 1);
> +
> + if (err)
> + return err;
> + }
> +
> +	mutex_lock(&inode->i_mutex);
> +	/* Recheck file flags under i_mutex */
> +	/* It's not possible to punch a hole in an append-only file */
> +	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
> +		err = -EPERM;
> +		goto out_mutex;
> +	}
> +	if (IS_SWAPFILE(inode)) {
> +		err = -ETXTBSY;
> +		goto out_mutex;
> +	}
> +
> /* No need to punch hole beyond i_size */
> if (offset >= inode->i_size)
> - return 0;
> + goto out_mutex;
>
> /*
> * If the hole extends beyond i_size, set the hole
> @@ -4588,33 +4608,25 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
> first_page_offset = first_page << PAGE_CACHE_SHIFT;
> last_page_offset = last_page << PAGE_CACHE_SHIFT;
>
> - /*
> - * Write out all dirty pages to avoid race conditions
> - * Then release them.
> - */
> - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
> - err = filemap_write_and_wait_range(mapping,
> - offset, offset + length - 1);
> -
> - if (err)
> - return err;
> - }
> -
> /* Now release the pages */
> if (last_page_offset > first_page_offset) {
> truncate_pagecache_range(inode, first_page_offset,
> last_page_offset - 1);
> }
>
> - /* finish any pending end_io work */
> +	/* Wait for all existing dio workers; newcomers will block on i_mutex */
> + ext4_inode_block_unlocked_dio(inode);
> + inode_dio_wait(inode);
> err = ext4_flush_completed_IO(inode);
> if (err)
> - return err;
> + goto out_dio;
>
> credits = ext4_writepage_trans_blocks(inode);
> handle = ext4_journal_start(inode, credits);
> - if (IS_ERR(handle))
> - return PTR_ERR(handle);
> + if (IS_ERR(handle)) {
> + err = PTR_ERR(handle);
> + goto out_dio;
> + }
>
>
> /*
> @@ -4706,6 +4718,10 @@ out:
> inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
> ext4_mark_inode_dirty(handle, inode);
> ext4_journal_stop(handle);
> +out_dio:
> + ext4_inode_resume_unlocked_dio(inode);
> +out_mutex:
> + mutex_unlock(&inode->i_mutex);
> return err;
> }
> int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> --
> 1.7.7.6
>
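[For reference, the resulting control flow of ext4_ext_punch_hole()
after this patch, condensed from the diff above (error paths elided):

	filemap_write_and_wait_range(...);     /* flush dirty pages in range */
	mutex_lock(&inode->i_mutex);           /* block new writers */
	/* recheck IS_APPEND/IS_IMMUTABLE/IS_SWAPFILE under i_mutex */
	truncate_pagecache_range(...);         /* release cached pages */
	ext4_inode_block_unlocked_dio(inode);  /* stop nonlocked DIO readers */
	inode_dio_wait(inode);                 /* drain in-flight DIO */
	ext4_flush_completed_IO(inode);        /* finish pending conversions */
	/* ... journalled extent removal ... */
	ext4_inode_resume_unlocked_dio(inode);
	mutex_unlock(&inode->i_mutex);
]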
On Fri 28-09-12 19:44:04, Dmitry Monakhov wrote:
Couple of language fixes:
> Current unwritten extent conversion state-machine is very fuzzy.
> - By unknown reason it want perform conversion under i_mutex. What for?
^^ For ^^^^^^^^ I'd just make it "performs"
> My diagnosis:
> We already protect extent tree with i_data_sem, truncate and punch_hole
> should wait for DIO, so the only data we have to protect is end_io->flags
> modification, but only flush_completed_IO and end_io_work modified this
^^ these
> flags and we can serialize them via i_completed_io_lock.
>
> Currently all this games with mutex_trylock result in following deadlock
^^^ these ^ the
> truncate: kworker:
> ext4_setattr ext4_end_io_work
> mutex_lock(i_mutex)
> inode_dio_wait(inode) ->BLOCK
> DEADLOCK<- mutex_trylock()
> inode_dio_done()
> #TEST_CASE1_BEGIN
> MNT=/mnt_scrach
> unlink $MNT/file
> fallocate -l $((1024*1024*1024)) $MNT/file
> aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
> sleep 2
> truncate -s 0 $MNT/file
> #TEST_CASE1_END
>
> Or use 286's xfstests https://github.com/dmonakhov/xfstests/blob/devel/286
>
> This patch makes state machine simple and clean:
>
> (1) xxx_end_io schedule final extent conversion simply by calling
> ext4_add_complete_io(), which append it to ei->i_completed_io_list
> NOTE1: because of (2A) work should be queued only if
> ->i_completed_io_list was empty, otherwise it work is scheduled already.
^^ remove this
>
> (2) ext4_flush_completed_IO is responsible for handling all pending
> end_io from ei->i_completed_io_list
> Flushing sequange consist of following stages:
^^ sequence ^ consists
> A) LOCKED: Atomically drain completed_io_list to local_list
> B) Perform extents conversion
> C) LOCKED: move conveted io's to to_free list for final deletion
^^ converted
> This logic depends on context which we was called from.
^^ were
> D) Final end_io context destruction
> NOTE1: i_mutex is no longer required because end_io->flags modification
> protected by ei->ext4_complete_io_lock
^^ is protected
>
> Full list of changes:
> - Move all completion end_io related routines to page-io.c in order to improve
> logic locality
> - Move open coded logic from various xx_end_xx routines to ext4_add_complete_io()
> - remove EXT4_IO_END_FSYNC
> - Improve SMP scalability by removing useless i_mutex which does not
> protect io->flags anymore.
> - Reduce lock contention on i_completed_io_lock by optimizing list walk.
> - Rename ext4_end_io_nolock to end4_end_io and make it static
> - Check flush completion status to ext4_ext_punch_hole(). Because it is
> not good idea to punch blocks from corrupted inode.
>
> Changes since V3 (in request to Jan's comments):
> Fall back to active flush_completed_IO() approach in order to prevent
> performance issues with nolocked DIO reads.
> Changes since V2:
> Fix use-after-free caused by race truncate vs end_io_work
>
> Signed-off-by: Dmitry Monakhov <[email protected]>
> ---
> fs/ext4/ext4.h | 3 +-
> fs/ext4/extents.c | 4 +-
> fs/ext4/fsync.c | 81 -------------------------
> fs/ext4/indirect.c | 6 +-
> fs/ext4/inode.c | 25 +-------
> fs/ext4/page-io.c | 171 ++++++++++++++++++++++++++++++++++------------------
> 6 files changed, 121 insertions(+), 169 deletions(-)
>
...
> +static int ext4_do_flush_completed_IO(struct inode *inode,
> + ext4_io_end_t *work_io)
> +{
> + ext4_io_end_t *io;
> + struct list_head unwritten, complete, to_free;
> + unsigned long flags;
> + struct ext4_inode_info *ei = EXT4_I(inode);
> + int err, ret = 0;
> +
> + INIT_LIST_HEAD(&complete);
> + INIT_LIST_HEAD(&to_free);
> +
> + spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> + dump_completed_IO(inode);
> + list_replace_init(&ei->i_completed_io_list, &unwritten);
> + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> +
> + while (!list_empty(&unwritten)) {
> + io = list_entry(unwritten.next, ext4_io_end_t, list);
> + BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
> + list_del_init(&io->list);
> +
> + err = ext4_end_io(io);
> + if (unlikely(!ret && err))
> + ret = err;
> +
> + list_add_tail(&io->list, &complete);
> + }
> + /* It is important to update all flags for all end_io in one shot w/o
> + * dropping the lock.*/
Why? It would seem that once io structures are moved from
i_completed_io_list, they are unreachable by anyone else?
> + spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> + while (!list_empty(&complete)) {
> + io = list_entry(complete.next, ext4_io_end_t, list);
> + io->flag &= ~EXT4_IO_END_UNWRITTEN;
> + /* end_io context can not be destroyed now because it still
> + * used by queued worker. Worker thread will destroy it later */
> + if (io->flag & EXT4_IO_END_QUEUED)
> + list_del_init(&io->list);
> + else
> + list_move(&io->list, &to_free);
> + }
> + /* If we are called from worker context, it is time to clear queued
> + * flag, and destroy it's end_io if it was converted already */
> + if (work_io) {
> + work_io->flag &= ~EXT4_IO_END_QUEUED;
> + if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
> + list_add_tail(&work_io->list, &to_free);
> }
> - list_del_init(&io->list);
> spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> - (void) ext4_end_io_nolock(io);
> - mutex_unlock(&inode->i_mutex);
> -free:
> - ext4_free_io_end(io);
> +
> + while (!list_empty(&to_free)) {
> + io = list_entry(to_free.next, ext4_io_end_t, list);
> + list_del_init(&io->list);
> + ext4_free_io_end(io);
> + }
> + return ret;
> +}
> +
> +/*
> + * work on completed aio dio IO, to convert unwritten extents to extents
> + */
> +static void ext4_end_io_work(struct work_struct *work)
> +{
> + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> + ext4_do_flush_completed_IO(io->inode, io);
> +}
> +
> +int ext4_flush_completed_IO(struct inode *inode)
> +{
> + return ext4_do_flush_completed_IO(inode, NULL);
> }
Also it seems that when ext4_flush_completed_IO() is called, the
workqueue can have several IO structures queued in its local lists, thus
we miss them here and don't properly wait for all conversions?
Honza
On Mon, 1 Oct 2012 20:38:33 +0200, Jan Kara <[email protected]> wrote:
> On Fri 28-09-12 19:44:04, Dmitry Monakhov wrote:
> ...
> Also it seems that when ext4_flush_completed_IO() is called, the
> workqueue can have several IO structures queued in its local lists,
> thus we miss them here and don't properly wait for all conversions?
No, it is not, because the list is drained atomically and
ext4_add_complete_io() will queue work only if the list is empty.
A race between conversion and the dequeue process is not possible
because we hold the lock during the entire walk of the complete list,
so from an external point of view marking the list as converted
(clearing the unwritten flag) happens atomically. I've drawn all
possible situations and the race does not happen. If you know of any,
please let me know.
>
> Honza
On Tue 02-10-12 11:16:38, Dmitry Monakhov wrote:
...
> > > +static int ext4_do_flush_completed_IO(struct inode *inode,
> > > ...
> > > + /* It is important to update all flags for all end_io in one shot w/o
> > > + * dropping the lock.*/
> > Why? It would seem that once io structures are moved from
> > i_completed_io_list, they are unreachable by anyone else?
You seem to have missed this comment?
> > > ...
> > Also it seems that when ext4_flush_completed_IO() is called, the
> > workqueue can have several IO structures queued in its local lists,
> > thus we miss them here and don't properly wait for all conversions?
> No, it is not, because the list is drained atomically and
> ext4_add_complete_io() will queue work only if the list is empty.
>
> A race between conversion and the dequeue process is not possible
> because we hold the lock during the entire walk of the complete list,
> so from an external point of view marking the list as converted
> (clearing the unwritten flag) happens atomically. I've drawn all
> possible situations and the race does not happen. If you know of any,
> please let me know.
I guess I'm missing something obvious. So let's go step by step:
1) ext4_flush_completed_IO() must make sure there is no outstanding
conversion for the inode.
2) Now assume we have non-empty i_completed_io_list - thus work is queued.
3) The following situation seems to be possible:
CPU1 CPU2
(worker thread) (truncate)
ext4_end_io_work()
ext4_do_flush_completed_IO()
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
dump_completed_IO(inode);
list_replace_init(&ei->i_completed_io_list, &unwritten);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
ext4_flush_completed_IO()
ext4_do_flush_completed_IO()
- sees empty i_completed_io_list
=> exits
But we still have some conversions pending in 'unwritten' list. What am
I missing?
Honza
On Tue, 2 Oct 2012 12:31:41 +0200, Jan Kara <[email protected]> wrote:
> On Tue 02-10-12 11:16:38, Dmitry Monakhov wrote:
> ...
> > > > ...
> > > > + /* It is important to update all flags for all end_io in one shot w/o
> > > > + * dropping the lock.*/
> > > Why? It would seem that once io structures are moved from
> > > i_completed_io_list, they are unreachable by anyone else?
> You seem to have missed this comment?
Yep, I've missed that comment, and yes, it turns out it is not important
to update the whole list atomically; the flag may be dropped on each end_io.
>
> > ...
> But we still have some conversions pending in 'unwritten' list. What am
> I missing?
Indeed, I've simply missed that case, and it silently breaks integrity
sync ;(
Thank you for spotting this. I'll be back with an updated version.
>
> Honza
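[One way to close the hole conceded above is to pair the list flush with
a wait on the i_unwritten counter, which ext4_end_io() decrements and
whose waiters are woken on ext4_ioend_wq(inode). A sketch under those
assumptions (the helper name is illustrative):

static int ext4_flush_unwritten_io(struct inode *inode)
{
	int ret = ext4_do_flush_completed_IO(inode, NULL);

	/* Conversions that a worker already moved to its local list are
	 * still accounted in i_unwritten, so wait for them as well. */
	wait_event(*ext4_ioend_wq(inode),
		   atomic_read(&EXT4_I(inode)->i_unwritten) == 0);
	return ret;
}
]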
On Tue 02-10-12 14:57:22, Dmitry Monakhov wrote:
> On Tue, 2 Oct 2012 12:31:41 +0200, Jan Kara <[email protected]> wrote:
> > ...
> Indeed, I've simply missed that case, and it silently breaks integrity
> sync ;(
> Thank you for spotting this. I'll be back with an updated version.
Umm, actually, I was thinking about it and ext4_flush_completed_IO()
seems to be unnecessary in fsync these days. We don't call aio_complete()
until we perform the conversion so what fsync does to such IO is undefined.
Such optimization is a separate matter though.
But for truncate or punch hole, it is critical that all conversions are
really flushed.
Honza
On Tue, 2 Oct 2012 13:11:06 +0200, Jan Kara <[email protected]> wrote:
> On Tue 02-10-12 14:57:22, Dmitry Monakhov wrote:
> > ...
> > Indeed, I've simply missed that case, and it silently breaks integrity
> > sync ;( Thank you for spotting this. I'll be back with an updated version.
> Umm, actually, I was thinking about it and ext4_flush_completed_IO()
> seems to be unnecessary in fsync these days. We don't call aio_complete()
> until we perform the conversion so what fsync does to such IO is undefined.
> Such optimization is a separate matter though.
Yes, AIO is OK, but integrity fsync after a buffered write to an
unwritten extent is broken:

fsync()                          blkdev_completion              kwork
->filemap_write_and_wait_range
                                 ->ext4_end_bio
                                 ->end_page_writeback
<-- filemap_write_and_wait_range return
                                 ->ext4_add_complete_io
                                                                ->ext4_do_flush_completed_IO
                                                                 ->list_replace_init
->ext4_flush_completed_IO
  sees empty i_completed_io_list but a
  pending conversion still exists
                                                                ->ext4_end_io
>
> But for truncate or punch hole, it is critical that all conversions are
> really flushed.
>
> Honza
On Tue 02-10-12 16:42:39, Dmitry Monakhov wrote:
> On Tue, 2 Oct 2012 13:11:06 +0200, Jan Kara <[email protected]> wrote:
> > ...
> Yes, AIO is OK, but integrity fsync after a buffered write to an
> unwritten extent is broken:
> ...
Correct. Thanks for pointing that out.
Honza
On Tue, 2 Oct 2012 15:30:19 +0200, Jan Kara <[email protected]> wrote:
> On Tue 02-10-12 16:42:39, Dmitry Monakhov wrote:
> > ...
> > Yes, AIO is OK, but integrity fsync after a buffered write to an
> > unwritten extent is broken:
> > ...
> Correct. Thanks for pointing that out.
In my defence I should say that integrity fsync was broken before my
patch in the case of buffered writes, because end_page_writeback() is
called before the end_io is added to ei->i_completed_io_list:
fsync()                          blkdev_completion
->filemap_write_and_wait_range
                                 ->ext4_end_bio
                                 ->end_page_writeback
<-- filemap_write_and_wait_range return
->ext4_flush_completed_IO
  sees empty i_completed_io_list but a
  pending conversion still exists
                                 ->ext4_add_complete_io
On Wed 03-10-12 15:21:25, Dmitry Monakhov wrote:
> On Tue, 2 Oct 2012 15:30:19 +0200, Jan Kara <[email protected]> wrote:
> > ...
> In my defence I should say that integrity fsync was broken before my
> patch in the case of buffered writes, because end_page_writeback() is
> called before the end_io is added to ei->i_completed_io_list.
Right. Actually calling end_page_writeback() before we are sure the page
can be correctly reloaded from disk (i.e. before all extent manipulations
are done) is asking for trouble - see e.g. mail
http://lists.openwall.net/linux-ext4/2011/06/08/12 and further discussion.
The discussion was somewhat open-ended but at that time calling
end_page_writeback() after extent conversion was problematic because of
i_mutex. Now we don't need i_mutex for extent conversion, so it is safe to
call end_page_writeback() after we convert the extents. So moving
end_page_writeback() there would be good and it would simplify some logic
as well, I believe - in particular fsync() would be simpler.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
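[A sketch of the reordering suggested above: for unwritten-extent I/O,
leave PageWriteback set at bio completion and end it only after the
conversion is done. ext4_finish_io_pages() is a hypothetical helper
standing in for the per-page end_page_writeback() loop:

static void ext4_end_bio(struct bio *bio, int error)
{
	ext4_io_end_t *io_end = bio->bi_private;

	/* per-page error bookkeeping as before, but no
	 * end_page_writeback() here for unwritten io_ends */
	if (io_end->flag & EXT4_IO_END_UNWRITTEN)
		/* pages stay under writeback until conversion finishes */
		ext4_add_complete_io(io_end->inode, io_end);
	else
		ext4_finish_io_pages(io_end);	/* pages stable on disk */
}

with a matching ext4_finish_io_pages() call from the conversion path once
ext4_convert_unwritten_extents() succeeds. Then
filemap_write_and_wait_range() in fsync() could not return before the
extents were converted.]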
Hi
On Fri, Sep 28, 2012 at 8:44 AM, Dmitry Monakhov <[email protected]> wrote:
> RFC_MESSAGE: It is up to the committer whether to pick or drop this patch.
> Only one user exists, so it may be reasonable to move it inside the
> caller's body. The only disadvantage is that it makes
> ext4_do_flush_completed_IO() less readable.
>
>
> COMMIT_MESSAGE:
> ext4_do_flush_completed_IO() is the only user of this function.
>
> Signed-off-by: Dmitry Monakhov <[email protected]>
> ---
> fs/ext4/page-io.c | 56 ++++++++++++++++++++--------------------------------
> 1 files changed, 22 insertions(+), 34 deletions(-)
>
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 5b24c40..0435688 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -84,37 +84,6 @@ void ext4_free_io_end(ext4_io_end_t *io)
> kmem_cache_free(io_end_cachep, io);
> }
>
> -/* check a range of space and convert unwritten extents to written. */
> -static int ext4_end_io(ext4_io_end_t *io)
> -{
> - struct inode *inode = io->inode;
> - loff_t offset = io->offset;
> - ssize_t size = io->size;
> - int ret = 0;
> -
> - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
> - "list->prev 0x%p\n",
> - io, inode->i_ino, io->list.next, io->list.prev);
> -
> - ret = ext4_convert_unwritten_extents(inode, offset, size);
> - if (ret < 0) {
> - ext4_msg(inode->i_sb, KERN_EMERG,
> - "failed to convert unwritten extents to written "
> - "extents -- potential data loss! "
> - "(inode %lu, offset %llu, size %zd, error %d)",
> - inode->i_ino, offset, size, ret);
> - }
> - if (io->iocb)
> - aio_complete(io->iocb, io->result, 0);
> -
> - if (io->flag & EXT4_IO_END_DIRECT)
> - inode_dio_done(inode);
> - /* Wake up anyone waiting on unwritten extent conversion */
> - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
> - wake_up_all(ext4_ioend_wq(io->inode));
> - return ret;
> -}
> -
> static void dump_completed_IO(struct inode *inode)
> {
> #ifdef EXT4FS_DEBUG
> @@ -183,9 +152,28 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
> BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
> list_del_init(&io->list);
>
> - err = ext4_end_io(io);
> - if (unlikely(!ret && err))
> - ret = err;
> + ext4_debug("ext4_do_flush_completed_IO: io 0x%p, inode %lu\n",
> + io, inode->i_ino);
> +
> + err = ext4_convert_unwritten_extents(inode, io->offset,
> + io->size);
> + if (err < 0) {
> + ext4_msg(inode->i_sb, KERN_EMERG,
> + "failed to convert unwritten extents to written"
> + " extents -- potential data loss! "
> + "(inode %lu, offset %llu, size %zd, error %d)",
> + inode->i_ino, io->offset, io->size, err);
> + if (!ret)
> + ret = err;
> + }
> + if (io->iocb)
> + aio_complete(io->iocb, io->result, 0);
> +
> + if (io->flag & EXT4_IO_END_DIRECT)
> + inode_dio_done(inode);
> + /* Wake up anyone waiting on unwritten extent conversion */
> + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
> + wake_up_all(ext4_ioend_wq(io->inode));
Should we use "inode" instead of "io->inode"?
>
> list_add_tail(&io->list, &complete);
> }
> --
> 1.7.7.6
>
I ended up dropping this patch since it doesn't make any difference to
the generated code (gcc will take a static function which is only used
in one place and inline it), and it is a bit more understandable to
have ext4_end_io() as a separate function.
> > + /* Wake up anyone waiting on unwritten extent conversion */
> > + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
> > + wake_up_all(ext4_ioend_wq(io->inode));
>
> Should we use "inode" instead of "io->inode"?
I agree it would be a bit cleaner/more readable, but again it won't
make a difference to the generated assembly, and while we could do
this in the original code in ext4_end_io(), I'm trying to put this
patch series to bed so I can push it to Linus, and since we're now not
touching the code, it's not worth it to clean this up now. We can
take care of this later....
- Ted