The first patch ensures ->end_io is always called for direct I/O requests
that pass it in, even if there was a zero length write, or if an error
occured. The existing users have been updated to ignore it, but XFS
will make use of it in the future, and a comment in ext4 suggests it
might be useful for it as well.
The other two simplify the XFS direct I/O code.
_______________________________________________
xfs mailing list
[email protected]
http://oss.sgi.com/mailman/listinfo/xfs
See http://www.infradead.org/rpr.html
This way we can pass back errors to the file system, and allow for
cleanup required for all direct I/O invocations.
Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 2 +-
fs/direct-io.c | 4 ++--
fs/ext4/inode.c | 3 +++
fs/ocfs2/aops.c | 3 +++
fs/xfs/xfs_aops.c | 3 +++
5 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 4fd6b0c..e47d1ba 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -267,7 +267,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
inode_unlock(inode);
- if ((retval > 0) && end_io)
+ if (end_io)
end_io(iocb, pos, retval, bh.b_private);
if (!(flags & DIO_SKIP_DIO_COUNT))
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1b2f7ff..ead7ac1 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -253,8 +253,8 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
if (ret == 0)
ret = transferred;
- if (dio->end_io && dio->result)
- dio->end_io(dio->iocb, offset, transferred, dio->private);
+ if (dio->end_io)
+ dio->end_io(dio->iocb, offset, ret, dio->private);
if (!(dio->flags & DIO_SKIP_DIO_COUNT))
inode_dio_end(dio->inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 83bc8bf..d04555b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3166,6 +3166,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
{
ext4_io_end_t *io_end = iocb->private;
+ if (size <= 0)
+ return;
+
/* if not async direct IO just return */
if (!io_end)
return;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 794fd15..1eeb804 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -628,6 +628,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
struct inode *inode = file_inode(iocb->ki_filp);
int level;
+ if (bytes <= 0)
+ return;
+
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 379c089..c318e9f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1655,6 +1655,9 @@ xfs_end_io_direct_write(
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_ioend *ioend = private;
+ if (size <= 0)
+ return;
+
trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
ioend ? ioend->io_type : 0, NULL);
--
2.1.4
We only need to communicate two bits of information to the direct I/O
completion handler:
(1) do we need to convert any unwritten extents in the range
(2) do we need to check if we need to update the inode size based
on the range passed to the completion handler
We can use the private data passed to the get_block handler and the
completion handler as a simple bitmask to communicate this information
instead of the current complicated infrastructure reusing the ioends
from the buffer I/O path, and thus avoiding a memory allocation and
a context switch for any non-trivial direct write. As a nice side
effect we also decouple the direct I/O path implementation from that
of the buffered I/O path.
Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/xfs/xfs_aops.c | 216 ++++++++++++++++++-----------------------------------
fs/xfs/xfs_trace.h | 9 +--
2 files changed, 77 insertions(+), 148 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index c318e9f..f6b08ea 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -36,6 +36,10 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
+/* flags for direct write completions */
+#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
+#define XFS_DIO_FLAG_APPEND (1 << 1)
+
void
xfs_count_page_state(
struct page *page,
@@ -1238,27 +1242,8 @@ xfs_vm_releasepage(
}
/*
- * When we map a DIO buffer, we may need to attach an ioend that describes the
- * type of write IO we are doing. This passes to the completion function the
- * operations it needs to perform. If the mapping is for an overwrite wholly
- * within the EOF then we don't need an ioend and so we don't allocate one.
- * This avoids the unnecessary overhead of allocating and freeing ioends for
- * workloads that don't require transactions on IO completion.
- *
- * If we get multiple mappings in a single IO, we might be mapping different
- * types. But because the direct IO can only have a single private pointer, we
- * need to ensure that:
- *
- * a) i) the ioend spans the entire region of unwritten mappings; or
- * ii) the ioend spans all the mappings that cross or are beyond EOF; and
- * b) if it contains unwritten extents, it is *permanently* marked as such
- *
- * We could do this by chaining ioends like buffered IO does, but we only
- * actually get one IO completion callback from the direct IO, and that spans
- * the entire IO regardless of how many mappings and IOs are needed to complete
- * the DIO. There is only going to be one reference to the ioend and its life
- * cycle is constrained by the DIO completion code. hence we don't need
- * reference counting here.
+ * When we map a DIO buffer, we may need to pass flags to
+ * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
*
* Note that for DIO, an IO to the highest supported file block offset (i.e.
* 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
@@ -1266,68 +1251,26 @@ xfs_vm_releasepage(
* extending the file size. We won't know for sure until IO completion is run
* and the actual max write offset is communicated to the IO completion
* routine.
- *
- * For DAX page faults, we are preparing to never see unwritten extents here,
- * nor should we ever extend the inode size. Hence we will soon have nothing to
- * do here for this case, ensuring we don't have to provide an IO completion
- * callback to free an ioend that we don't actually need for a fault into the
- * page at offset (2^63 - 1FSB) bytes.
*/
-
static void
xfs_map_direct(
struct inode *inode,
struct buffer_head *bh_result,
struct xfs_bmbt_irec *imap,
- xfs_off_t offset,
- bool dax_fault)
+ xfs_off_t offset)
{
- struct xfs_ioend *ioend;
+ uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
xfs_off_t size = bh_result->b_size;
- int type;
-
- if (ISUNWRITTEN(imap))
- type = XFS_IO_UNWRITTEN;
- else
- type = XFS_IO_OVERWRITE;
-
- trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
-
- if (dax_fault) {
- ASSERT(type == XFS_IO_OVERWRITE);
- trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
- imap);
- return;
- }
- if (bh_result->b_private) {
- ioend = bh_result->b_private;
- ASSERT(ioend->io_size > 0);
- ASSERT(offset >= ioend->io_offset);
- if (offset + size > ioend->io_offset + ioend->io_size)
- ioend->io_size = offset - ioend->io_offset + size;
-
- if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
- ioend->io_type = XFS_IO_UNWRITTEN;
-
- trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
- ioend->io_size, ioend->io_type,
- imap);
- } else if (type == XFS_IO_UNWRITTEN ||
- offset + size > i_size_read(inode) ||
- offset + size < 0) {
- ioend = xfs_alloc_ioend(inode, type);
- ioend->io_offset = offset;
- ioend->io_size = size;
+ trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
+ ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
- bh_result->b_private = ioend;
+ if (ISUNWRITTEN(imap)) {
+ *flags |= XFS_DIO_FLAG_UNWRITTEN;
+ set_buffer_defer_completion(bh_result);
+ } else if (offset + size > i_size_read(inode) || offset + size < 0) {
+ *flags |= XFS_DIO_FLAG_APPEND;
set_buffer_defer_completion(bh_result);
-
- trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
- imap);
- } else {
- trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
- imap);
}
}
@@ -1498,9 +1441,12 @@ __xfs_get_blocks(
if (ISUNWRITTEN(&imap))
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
- if (create && direct)
- xfs_map_direct(inode, bh_result, &imap, offset,
- dax_fault);
+ if (create && direct) {
+ if (dax_fault)
+ ASSERT(!ISUNWRITTEN(&imap));
+ else
+ xfs_map_direct(inode, bh_result, &imap, offset);
+ }
}
/*
@@ -1570,42 +1516,50 @@ xfs_get_blocks_dax_fault(
return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
}
-static void
-__xfs_end_io_direct_write(
- struct inode *inode,
- struct xfs_ioend *ioend,
+/*
+ * Complete a direct I/O write request.
+ *
+ * xfs_map_direct passes us some flags in the private data to tell us what to
+ * do. If not flags are set, then the write IO is an overwrite wholly within
+ * the existing allocated file size and so there is nothing for us to do.
+ *
+ * Note that in this case the completion can be called in interrupt context,
+ * whereas if we have flags set we will always be called in task context
+ * (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+ struct kiocb *iocb,
loff_t offset,
- ssize_t size)
+ ssize_t size,
+ void *private)
{
- struct xfs_mount *mp = XFS_I(inode)->i_mount;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ uintptr_t flags = (uintptr_t)private;
+ int error = 0;
- if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
- goto out_end_io;
+ trace_xfs_end_io_direct_write(ip, offset, size);
- /*
- * dio completion end_io functions are only called on writes if more
- * than 0 bytes was written.
- */
- ASSERT(size > 0);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return;
- /*
- * The ioend only maps whole blocks, while the IO may be sector aligned.
- * Hence the ioend offset/size may not match the IO offset/size exactly.
- * Because we don't map overwrites within EOF into the ioend, the offset
- * may not match, but only if the endio spans EOF. Either way, write
- * the IO sizes into the ioend so that completion processing does the
- * right thing.
- */
- ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
- ioend->io_size = size;
- ioend->io_offset = offset;
+ if (size <= 0)
+ return;
/*
- * The ioend tells us whether we are doing unwritten extent conversion
+ * The flags tell us whether we are doing unwritten extent conversion
* or an append transaction that updates the on-disk file size. These
* cases are the only cases where we should *potentially* be needing
* to update the VFS inode size.
- *
+ */
+ if (flags == 0) {
+ ASSERT(offset + size <= i_size_read(inode));
+ return;
+ }
+
+ /*
* We need to update the in-core inode size here so that we don't end up
* with the on-disk inode size being outside the in-core inode size. We
* have no other method of updating EOF for AIO, so always do it here
@@ -1616,57 +1570,33 @@ __xfs_end_io_direct_write(
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
*/
- spin_lock(&XFS_I(inode)->i_flags_lock);
+ spin_lock(&ip->i_flags_lock);
if (offset + size > i_size_read(inode))
i_size_write(inode, offset + size);
- spin_unlock(&XFS_I(inode)->i_flags_lock);
+ spin_unlock(&ip->i_flags_lock);
- /*
- * If we are doing an append IO that needs to update the EOF on disk,
- * do the transaction reserve now so we can use common end io
- * processing. Stashing the error (if there is one) in the ioend will
- * result in the ioend processing passing on the error if it is
- * possible as we can't return it from here.
- */
- if (ioend->io_type == XFS_IO_OVERWRITE)
- ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
+ if (flags & XFS_DIO_FLAG_UNWRITTEN) {
+ trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
-out_end_io:
- xfs_end_io(&ioend->io_work);
- return;
-}
+ error = xfs_iomap_write_unwritten(ip, offset, size);
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
- struct kiocb *iocb,
- loff_t offset,
- ssize_t size,
- void *private)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct xfs_ioend *ioend = private;
+ WARN_ON_ONCE(error);
+ } else if (flags & XFS_DIO_FLAG_APPEND) {
+ struct xfs_trans *tp;
- if (size <= 0)
- return;
+ trace_xfs_end_io_direct_write_append(ip, offset, size);
- trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
- ioend ? ioend->io_type : 0, NULL);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- if (!ioend) {
- ASSERT(offset + size <= i_size_read(inode));
- return;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp);
+ WARN_ON_ONCE(error);
+ return;
+ }
+ error = xfs_setfilesize(ip, tp, offset, size);
+ WARN_ON_ONCE(error);
}
-
- __xfs_end_io_direct_write(inode, ioend, offset, size);
}
static inline ssize_t
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 391d797..c8d5842 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1296,11 +1296,7 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1340,6 +1336,9 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
--
2.1.4
_______________________________________________
xfs mailing list
[email protected]
http://oss.sgi.com/mailman/listinfo/xfs
Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/xfs/xfs_aops.c | 39 ++++++++++++++-------------------------
1 file changed, 14 insertions(+), 25 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f6b08ea..e3cb7f8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1599,41 +1599,30 @@ xfs_end_io_direct_write(
}
}
-static inline ssize_t
-xfs_vm_do_dio(
- struct inode *inode,
+STATIC ssize_t
+xfs_vm_direct_IO(
struct kiocb *iocb,
struct iov_iter *iter,
- loff_t offset,
- void (*endio)(struct kiocb *iocb,
- loff_t offset,
- ssize_t size,
- void *private),
- int flags)
+ loff_t offset)
{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ dio_iodone_t *endio = NULL;
+ int flags = 0;
struct block_device *bdev;
- if (IS_DAX(inode))
+ if (iov_iter_rw(iter) == WRITE) {
+ endio = xfs_end_io_direct_write;
+ flags = DIO_ASYNC_EXTEND;
+ }
+
+ if (IS_DAX(inode)) {
return dax_do_io(iocb, inode, iter, offset,
xfs_get_blocks_direct, endio, 0);
+ }
bdev = xfs_find_bdev_for_inode(inode);
return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
- xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-STATIC ssize_t
-xfs_vm_direct_IO(
- struct kiocb *iocb,
- struct iov_iter *iter,
- loff_t offset)
-{
- struct inode *inode = iocb->ki_filp->f_mapping->host;
-
- if (iov_iter_rw(iter) == WRITE)
- return xfs_vm_do_dio(inode, iocb, iter, offset,
- xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
- return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
+ xfs_get_blocks_direct, endio, NULL, flags);
}
/*
--
2.1.4
_______________________________________________
xfs mailing list
[email protected]
http://oss.sgi.com/mailman/listinfo/xfs
On Tue, Feb 02, 2016 at 09:17:35PM +0100, Christoph Hellwig wrote:
> See http://www.infradead.org/rpr.html
Ick. ^^^
> This way we can pass back errors to the file system, and allow for
> cleanup required for all direct I/O invocations.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 2 +-
> fs/direct-io.c | 4 ++--
> fs/ext4/inode.c | 3 +++
> fs/ocfs2/aops.c | 3 +++
> fs/xfs/xfs_aops.c | 3 +++
> 5 files changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 4fd6b0c..e47d1ba 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -267,7 +267,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
> if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
> inode_unlock(inode);
>
> - if ((retval > 0) && end_io)
> + if (end_io)
> end_io(iocb, pos, retval, bh.b_private);
>
> if (!(flags & DIO_SKIP_DIO_COUNT))
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 1b2f7ff..ead7ac1 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -253,8 +253,8 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
> if (ret == 0)
> ret = transferred;
>
> - if (dio->end_io && dio->result)
> - dio->end_io(dio->iocb, offset, transferred, dio->private);
> + if (dio->end_io)
> + dio->end_io(dio->iocb, offset, ret, dio->private);
Could we make end_io return an int so that errors during completion can be
stuffed into ret to be picked up by whatever's calling directio? Something
like this:
if (dio->end_io) {
int ret2;
ret2 = dio->end_io(dio->iocb, offset, ret, dio->private);
if (ret2 && !ret)
ret = ret2;
}
That way I can capture IO errors during the CoW remapping step and pass them to
userland either via dio_complete()'s return value or through ki_complete.
(If ret itself is an error code then obviously we don't bother with the
post-CoW remap.)
--D
>
> if (!(dio->flags & DIO_SKIP_DIO_COUNT))
> inode_dio_end(dio->inode);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 83bc8bf..d04555b 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3166,6 +3166,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> {
> ext4_io_end_t *io_end = iocb->private;
>
> + if (size <= 0)
> + return;
> +
> /* if not async direct IO just return */
> if (!io_end)
> return;
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index 794fd15..1eeb804 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -628,6 +628,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
> struct inode *inode = file_inode(iocb->ki_filp);
> int level;
>
> + if (bytes <= 0)
> + return;
> +
> /* this io's submitter should not have unlocked this before we could */
> BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
>
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 379c089..c318e9f 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -1655,6 +1655,9 @@ xfs_end_io_direct_write(
> struct inode *inode = file_inode(iocb->ki_filp);
> struct xfs_ioend *ioend = private;
>
> + if (size <= 0)
> + return;
> +
> trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
> ioend ? ioend->io_type : 0, NULL);
>
> --
> 2.1.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Feb 02, 2016 at 09:17:37PM +0100, Christoph Hellwig wrote:
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
Reviewed-by: Brian Foster <[email protected]>
> fs/xfs/xfs_aops.c | 39 ++++++++++++++-------------------------
> 1 file changed, 14 insertions(+), 25 deletions(-)
>
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index f6b08ea..e3cb7f8 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -1599,41 +1599,30 @@ xfs_end_io_direct_write(
> }
> }
>
> -static inline ssize_t
> -xfs_vm_do_dio(
> - struct inode *inode,
> +STATIC ssize_t
> +xfs_vm_direct_IO(
> struct kiocb *iocb,
> struct iov_iter *iter,
> - loff_t offset,
> - void (*endio)(struct kiocb *iocb,
> - loff_t offset,
> - ssize_t size,
> - void *private),
> - int flags)
> + loff_t offset)
> {
> + struct inode *inode = iocb->ki_filp->f_mapping->host;
> + dio_iodone_t *endio = NULL;
> + int flags = 0;
> struct block_device *bdev;
>
> - if (IS_DAX(inode))
> + if (iov_iter_rw(iter) == WRITE) {
> + endio = xfs_end_io_direct_write;
> + flags = DIO_ASYNC_EXTEND;
> + }
> +
> + if (IS_DAX(inode)) {
> return dax_do_io(iocb, inode, iter, offset,
> xfs_get_blocks_direct, endio, 0);
> + }
>
> bdev = xfs_find_bdev_for_inode(inode);
> return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
> - xfs_get_blocks_direct, endio, NULL, flags);
> -}
> -
> -STATIC ssize_t
> -xfs_vm_direct_IO(
> - struct kiocb *iocb,
> - struct iov_iter *iter,
> - loff_t offset)
> -{
> - struct inode *inode = iocb->ki_filp->f_mapping->host;
> -
> - if (iov_iter_rw(iter) == WRITE)
> - return xfs_vm_do_dio(inode, iocb, iter, offset,
> - xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
> - return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
> + xfs_get_blocks_direct, endio, NULL, flags);
> }
>
> /*
> --
> 2.1.4
>
> _______________________________________________
> xfs mailing list
> [email protected]
> http://oss.sgi.com/mailman/listinfo/xfs
On Tue, Feb 02, 2016 at 09:17:36PM +0100, Christoph Hellwig wrote:
> We only need to communicate two bits of information to the direct I/O
> completion handler:
>
> (1) do we need to convert any unwritten extents in the range
> (2) do we need to check if we need to update the inode size based
> on the range passed to the completion handler
>
> We can use the private data passed to the get_block handler and the
> completion handler as a simple bitmask to communicate this information
> instead of the current complicated infrastructure reusing the ioends
> from the buffer I/O path, and thus avoiding a memory allocation and
> a context switch for any non-trivial direct write. As a nice side
> effect we also decouple the direct I/O path implementation from that
> of the buffered I/O path.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/xfs/xfs_aops.c | 216 ++++++++++++++++++-----------------------------------
> fs/xfs/xfs_trace.h | 9 +--
> 2 files changed, 77 insertions(+), 148 deletions(-)
>
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index c318e9f..f6b08ea 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
...
> @@ -1570,42 +1516,50 @@ xfs_get_blocks_dax_fault(
> return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
> }
>
> -static void
> -__xfs_end_io_direct_write(
> - struct inode *inode,
> - struct xfs_ioend *ioend,
> +/*
> + * Complete a direct I/O write request.
> + *
> + * xfs_map_direct passes us some flags in the private data to tell us what to
> + * do. If not flags are set, then the write IO is an overwrite wholly within
"If no flags ..."
Otherwise, looks fine:
Reviewed-by: Brian Foster <[email protected]>
> + * the existing allocated file size and so there is nothing for us to do.
> + *
> + * Note that in this case the completion can be called in interrupt context,
> + * whereas if we have flags set we will always be called in task context
> + * (i.e. from a workqueue).
> + */
> +STATIC void
> +xfs_end_io_direct_write(
> + struct kiocb *iocb,
> loff_t offset,
> - ssize_t size)
> + ssize_t size,
> + void *private)
> {
> - struct xfs_mount *mp = XFS_I(inode)->i_mount;
> + struct inode *inode = file_inode(iocb->ki_filp);
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_mount *mp = ip->i_mount;
> + uintptr_t flags = (uintptr_t)private;
> + int error = 0;
>
> - if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
> - goto out_end_io;
> + trace_xfs_end_io_direct_write(ip, offset, size);
>
> - /*
> - * dio completion end_io functions are only called on writes if more
> - * than 0 bytes was written.
> - */
> - ASSERT(size > 0);
> + if (XFS_FORCED_SHUTDOWN(mp))
> + return;
>
> - /*
> - * The ioend only maps whole blocks, while the IO may be sector aligned.
> - * Hence the ioend offset/size may not match the IO offset/size exactly.
> - * Because we don't map overwrites within EOF into the ioend, the offset
> - * may not match, but only if the endio spans EOF. Either way, write
> - * the IO sizes into the ioend so that completion processing does the
> - * right thing.
> - */
> - ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
> - ioend->io_size = size;
> - ioend->io_offset = offset;
> + if (size <= 0)
> + return;
>
> /*
> - * The ioend tells us whether we are doing unwritten extent conversion
> + * The flags tell us whether we are doing unwritten extent conversion
> * or an append transaction that updates the on-disk file size. These
> * cases are the only cases where we should *potentially* be needing
> * to update the VFS inode size.
> - *
> + */
> + if (flags == 0) {
> + ASSERT(offset + size <= i_size_read(inode));
> + return;
> + }
> +
> + /*
> * We need to update the in-core inode size here so that we don't end up
> * with the on-disk inode size being outside the in-core inode size. We
> * have no other method of updating EOF for AIO, so always do it here
> @@ -1616,57 +1570,33 @@ __xfs_end_io_direct_write(
> * here can result in EOF moving backwards and Bad Things Happen when
> * that occurs.
> */
> - spin_lock(&XFS_I(inode)->i_flags_lock);
> + spin_lock(&ip->i_flags_lock);
> if (offset + size > i_size_read(inode))
> i_size_write(inode, offset + size);
> - spin_unlock(&XFS_I(inode)->i_flags_lock);
> + spin_unlock(&ip->i_flags_lock);
>
> - /*
> - * If we are doing an append IO that needs to update the EOF on disk,
> - * do the transaction reserve now so we can use common end io
> - * processing. Stashing the error (if there is one) in the ioend will
> - * result in the ioend processing passing on the error if it is
> - * possible as we can't return it from here.
> - */
> - if (ioend->io_type == XFS_IO_OVERWRITE)
> - ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
> + if (flags & XFS_DIO_FLAG_UNWRITTEN) {
> + trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
>
> -out_end_io:
> - xfs_end_io(&ioend->io_work);
> - return;
> -}
> + error = xfs_iomap_write_unwritten(ip, offset, size);
>
> -/*
> - * Complete a direct I/O write request.
> - *
> - * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
> - * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
> - * wholly within the EOF and so there is nothing for us to do. Note that in this
> - * case the completion can be called in interrupt context, whereas if we have an
> - * ioend we will always be called in task context (i.e. from a workqueue).
> - */
> -STATIC void
> -xfs_end_io_direct_write(
> - struct kiocb *iocb,
> - loff_t offset,
> - ssize_t size,
> - void *private)
> -{
> - struct inode *inode = file_inode(iocb->ki_filp);
> - struct xfs_ioend *ioend = private;
> + WARN_ON_ONCE(error);
> + } else if (flags & XFS_DIO_FLAG_APPEND) {
> + struct xfs_trans *tp;
>
> - if (size <= 0)
> - return;
> + trace_xfs_end_io_direct_write_append(ip, offset, size);
>
> - trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
> - ioend ? ioend->io_type : 0, NULL);
> + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
>
> - if (!ioend) {
> - ASSERT(offset + size <= i_size_read(inode));
> - return;
> + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
> + if (error) {
> + xfs_trans_cancel(tp);
> + WARN_ON_ONCE(error);
> + return;
> + }
> + error = xfs_setfilesize(ip, tp, offset, size);
> + WARN_ON_ONCE(error);
> }
> -
> - __xfs_end_io_direct_write(inode, ioend, offset, size);
> }
>
> static inline ssize_t
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 391d797..c8d5842 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1296,11 +1296,7 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
> DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
> DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
> DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
> -DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
> -DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
> -DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
> -DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
> -DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
> +DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
>
> DECLARE_EVENT_CLASS(xfs_simple_io_class,
> TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
> @@ -1340,6 +1336,9 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
> DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
> DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
> DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
> +DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
> +DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
> +DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
>
> DECLARE_EVENT_CLASS(xfs_itrunc_class,
> TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
> --
> 2.1.4
>
> _______________________________________________
> xfs mailing list
> [email protected]
> http://oss.sgi.com/mailman/listinfo/xfs
> > - if (dio->end_io && dio->result)
> > - dio->end_io(dio->iocb, offset, transferred, dio->private);
> > + if (dio->end_io)
> > + dio->end_io(dio->iocb, offset, ret, dio->private);
>
> Could we make end_io return an int so that errors during completion can be
> stuffed into ret to be picked up by whatever's calling directio? Something
> like this:
>
> if (dio->end_io) {
> int ret2;
>
> ret2 = dio->end_io(dio->iocb, offset, ret, dio->private);
> if (ret2 && !ret)
> ret = ret2;
> }
>
> That way I can capture IO errors during the CoW remapping step and pass them to
> userland either via dio_complete()'s return value or through ki_complete.
>
> (If ret itself is an error code then obviously we don't bother with the
> post-CoW remap.)
Should be doable, I'll respin it with that change.