2020-07-08 21:20:27

by Goldwyn Rodrigues

[permalink] [raw]
Subject: [PATCH 0/6 v11] btrfs direct-io using iomap

This is an effort to use iomap for direct I/O in btrfs. This would
change the call from __blockdev_direct_io() to iomap_dio_rw().
These are remanants of the series which was revoked due to page
invalidation errors and adds patches in iomap for proper buffered
fallback.

The main objective is to lose the buffer head and use bio defined by
iomap code, and to use more of generic-FS codebase.

These patches are based and tested on vanilla. I have tested it against
xfstests.

The tree is available at
https://github.com/goldwynr/linux/tree/btrfs-iomap-dio

Changes since v1
- Incorporated back the efficiency change for inode locking
- Review comments about coding style and git comments
- Merge related patches into one
- Direct read to go through btrfs_direct_IO()
- Removal of no longer used function dio_end_io()

Changes since v2
- aligning iomap offset/length to the position/length of I/O
- Removed btrfs_dio_data
- Removed BTRFS_INODE_READDIO_NEED_LOCK
- Re-incorporating write efficiency changes caused lockdep_assert() in
iomap to be triggered, remove that code.

Changes since v3
- Fixed freeze on generic/095. Use iomap_end() to account for
failed/incomplete dio instead of btrfs_dio_data

Changes since v4
- moved lockdep_assert_held() to functions calling iomap_dio_rw()
This may be called immidiately after calling inode lock and
may feel not required, but it seems important.
- Removed comments which are no longer required
- Changed commit comments to make them more appropriate

Changes since v5
- restore inode_dio_wait() in truncate
- Removed lockdep_assert_held() near callers
Changes since v6
- Fixed hangs due to underlying device failures
- Removed the patch to wait while releasing pages

Changes since v7
- Moved reservation into btrfs iomap begin()/end() for correct
reservation cleanup in case of device error.
- Merged patches switch to iomap, and adding btrfs_iomap_end()
for easier bisection. The switch to iomap would fail in case
of dio after buffered writes.

Changes since v8
- Added a flag to iomap_dio_rw() to return zero for buffered fallback

Changes since v9
- flag name changes for iomap_dio_rw()

Changes since v10
- flag datatype changed to unsigned int

fs/btrfs/Kconfig | 1
fs/btrfs/btrfs_inode.h | 18 --
fs/btrfs/ctree.h | 5
fs/btrfs/file.c | 116 ++++++++++++--
fs/btrfs/inode.c | 402 ++++++++++++++++++-------------------------------
fs/direct-io.c | 19 --
fs/ext4/file.c | 11 +
fs/gfs2/file.c | 14 +
fs/iomap/direct-io.c | 11 +
fs/xfs/xfs_file.c | 15 +
fs/zonefs/super.c | 16 +
include/linux/fs.h | 2
include/linux/iomap.h | 25 ++-
13 files changed, 335 insertions(+), 320 deletions(-)

--
Goldwyn




2020-07-08 21:20:31

by Goldwyn Rodrigues

[permalink] [raw]
Subject: [PATCH 1/6] iomap: Convert wait_for_completion to flags

From: Goldwyn Rodrigues <[email protected]>

Convert wait_for_completion boolean to flags so we can pass more flags
to iomap_dio_rw()

Signed-off-by: Goldwyn Rodrigues <[email protected]>
---
fs/ext4/file.c | 11 +++++++++--
fs/gfs2/file.c | 14 ++++++++++----
fs/iomap/direct-io.c | 3 ++-
fs/xfs/xfs_file.c | 15 +++++++++++----
fs/zonefs/super.c | 16 ++++++++++++----
include/linux/iomap.h | 11 ++++++++++-
6 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2a01e31a032c..8f6324eb6b27 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -53,6 +53,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t ret;
struct inode *inode = file_inode(iocb->ki_filp);
+ unsigned int flags = 0;

if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock_shared(inode))
@@ -74,8 +75,11 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}

+ if (is_sync_kiocb(iocb))
+ flags |= IOMAP_DIO_RWF_SYNCIO;
+
ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ flags);
inode_unlock_shared(inode);

file_accessed(iocb->ki_filp);
@@ -457,6 +461,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
bool extend = false, unaligned_io = false;
bool ilock_shared = true;
+ unsigned int flags = 0;

/*
* We initially start with shared inode lock unless it is
@@ -540,10 +545,12 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
ext4_journal_stop(handle);
}

+ if (is_sync_kiocb(iocb) || unaligned_io || extend)
+ flags |= IOMAP_DIO_RWF_SYNCIO;
if (ilock_shared)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
- is_sync_kiocb(iocb) || unaligned_io || extend);
+ flags);

if (extend)
ret = ext4_handle_inode_extension(inode, offset, ret, count);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index fe305e4bfd37..68f4ee4a20ee 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -767,6 +767,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to)
size_t count = iov_iter_count(to);
struct gfs2_holder gh;
ssize_t ret;
+ unsigned int flags = 0;

if (!count)
return 0; /* skip atime */
@@ -776,8 +777,10 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to)
if (ret)
goto out_uninit;

- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ if (is_sync_kiocb(iocb))
+ flags |= IOMAP_DIO_RWF_SYNCIO;
+
+ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, flags);

gfs2_glock_dq(&gh);
out_uninit:
@@ -794,6 +797,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
loff_t offset = iocb->ki_pos;
struct gfs2_holder gh;
ssize_t ret;
+ unsigned int flags = 0;

/*
* Deferred lock, even if its a write, since we do no allocation on
@@ -812,8 +816,10 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (offset + len > i_size_read(&ip->i_inode))
goto out;

- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ if (is_sync_kiocb(iocb))
+ flags |= IOMAP_DIO_RWF_SYNCIO;
+
+ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, flags);

out:
gfs2_glock_dq(&gh);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index ec7b78e6feca..2753b7022403 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -405,7 +405,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- bool wait_for_completion)
+ unsigned int dio_flags)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
@@ -415,6 +415,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
unsigned int flags = IOMAP_DIRECT;
struct blk_plug plug;
struct iomap_dio *dio;
+ bool wait_for_completion = dio_flags & IOMAP_DIO_RWF_SYNCIO;

if (!count)
return 0;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 00db81eac80d..6a7edb2c3167 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -169,6 +169,8 @@ xfs_file_dio_aio_read(
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
size_t count = iov_iter_count(to);
ssize_t ret;
+ unsigned int flags = 0;
+

trace_xfs_file_direct_read(ip, count, iocb->ki_pos);

@@ -183,8 +185,11 @@ xfs_file_dio_aio_read(
} else {
xfs_ilock(ip, XFS_IOLOCK_SHARED);
}
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+
+ if (is_sync_kiocb(iocb))
+ flags |= IOMAP_DIO_RWF_SYNCIO;
+
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, flags);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);

return ret;
@@ -483,6 +488,7 @@ xfs_file_dio_aio_write(
int iolock;
size_t count = iov_iter_count(from);
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ unsigned int flags = 0;

/* DIO must be aligned to device logical sector size */
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
@@ -546,9 +552,10 @@ xfs_file_dio_aio_write(
* If unaligned, this is the only IO in-flight. Wait on it before we
* release the iolock to prevent subsequent overlapping IO.
*/
+ if (is_sync_kiocb(iocb) || unaligned_io)
+ flags |= IOMAP_DIO_RWF_SYNCIO;
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops,
- is_sync_kiocb(iocb) || unaligned_io);
+ &xfs_dio_write_ops, flags);
out:
xfs_iunlock(ip, iolock);

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 07bc42d62673..798e2e636887 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -670,6 +670,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
bool append = false;
size_t count;
ssize_t ret;
+ int flags = 0;

/*
* For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
@@ -711,11 +712,15 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
append = sync;
}

- if (append)
+ if (append) {
ret = zonefs_file_dio_append(iocb, from);
- else
+ } else {
+ if (is_sync_kiocb(iocb))
+ flags |= IOMAP_DIO_RWF_SYNCIO;
+
ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
- &zonefs_write_dio_ops, sync);
+ &zonefs_write_dio_ops, flags);
+ }
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
@@ -814,6 +819,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct super_block *sb = inode->i_sb;
loff_t isize;
ssize_t ret;
+ int flags = 0;

/* Offline zones cannot be read */
if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
@@ -848,8 +854,10 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
goto inode_unlock;
}
file_accessed(iocb->ki_filp);
+ if (is_sync_kiocb(iocb))
+ flags |= IOMAP_DIO_RWF_SYNCIO;
ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
- &zonefs_read_dio_ops, is_sync_kiocb(iocb));
+ &zonefs_read_dio_ops, flags);
} else {
ret = generic_file_read_iter(iocb, to);
if (ret == -EIO)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4d1d3c3469e9..80cd5f524124 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -255,9 +255,18 @@ struct iomap_dio_ops {
struct bio *bio, loff_t file_offset);
};

+/*
+ * Flags to pass iomap_dio_rw()
+ */
+
+/*
+ * Wait for completion of DIO
+ */
+#define IOMAP_DIO_RWF_SYNCIO (1 << 0)
+
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- bool wait_for_completion);
+ unsigned int flags);
int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);

#ifdef CONFIG_SWAP
--
2.26.2

2020-07-08 21:20:38

by Goldwyn Rodrigues

[permalink] [raw]
Subject: [PATCH 2/6] iomap: IOMAP_DIO_RWF_NO_STALE_PAGECACHE return if page invalidation fails

From: Goldwyn Rodrigues <[email protected]>

For direct I/O, add the flag IOMAP_DIO_RWF_NO_STALE_PAGECACHE to indicate
that if the page invalidation fails, return back control to the
filesystem so it may fallback to buffered mode.

Reviewed-by: Darrick J. Wong <[email protected]>
Signed-off-by: Goldwyn Rodrigues <[email protected]>
---
fs/iomap/direct-io.c | 8 +++++++-
include/linux/iomap.h | 14 ++++++++++++++
2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 2753b7022403..66becf935865 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -484,8 +484,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
*/
ret = invalidate_inode_pages2_range(mapping,
pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
- if (ret)
+ if (ret) {
+ if (dio_flags & IOMAP_DIO_RWF_NO_STALE_PAGECACHE) {
+ if (ret == -EBUSY)
+ ret = 0;
+ goto out_free_dio;
+ }
dio_warn_stale_pagecache(iocb->ki_filp);
+ }
ret = 0;

if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 80cd5f524124..a68705369a2c 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -262,7 +262,21 @@ struct iomap_dio_ops {
/*
* Wait for completion of DIO
*/
+
#define IOMAP_DIO_RWF_SYNCIO (1 << 0)
+/*
+ * Direct IO will attempt to keep the page cache coherent by
+ * invalidating the inode's page cache over the range of the DIO.
+ * That can fail if something else is actively using the page cache.
+ * If this happens and the DIO continues, the data in the page
+ * cache will become stale.
+ *
+ * Set this flag if you want the DIO to abort without issuing any IO
+ * or error if it fails to invalidate the page cache successfully.
+ * This allows the IO submitter to fallback to buffered IO to resubmit
+ * IO
+ */
+#define IOMAP_DIO_RWF_NO_STALE_PAGECACHE (1 << 1)

ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
--
2.26.2

2020-07-08 21:21:15

by Goldwyn Rodrigues

[permalink] [raw]
Subject: [PATCH 5/6] btrfs: remove BTRFS_INODE_READDIO_NEED_LOCK

From: Goldwyn Rodrigues <[email protected]>

Since we now perform direct reads using i_rwsem, we can remove this
inode flag used to co-ordinate unlocked reads.

The truncate call takes i_rwsem. This means it is correctly synchronized
with concurrent direct reads.

Reviewed-by: Nikolay Borisov <[email protected]>
Reviewed-by: Johannes Thumshirn <[email protected]>
Signed-off-by: Goldwyn Rodrigues <[email protected]>
---
fs/btrfs/btrfs_inode.h | 18 ------------------
fs/btrfs/inode.c | 3 ---
2 files changed, 21 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index e7d709505cb1..aeff56a0e105 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -28,7 +28,6 @@ enum {
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
BTRFS_INODE_IN_DELALLOC_LIST,
- BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
};
@@ -313,23 +312,6 @@ struct btrfs_dio_private {
u8 csums[];
};

-/*
- * Disable DIO read nolock optimization, so new dio readers will be forced
- * to grab i_mutex. It is used to avoid the endless truncate due to
- * nonlocked dio read.
- */
-static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
-{
- set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
- smp_mb();
-}
-
-static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
-{
- smp_mb__before_atomic();
- clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
-}
-
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0fa75af35a1f..264b676ebf29 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4835,10 +4835,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)

truncate_setsize(inode, newsize);

- /* Disable nonlocked read DIO to avoid the endless truncate */
- btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
inode_dio_wait(inode);
- btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));

ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
--
2.26.2

2020-07-08 21:21:24

by Goldwyn Rodrigues

[permalink] [raw]
Subject: [PATCH 4/6] fs: remove dio_end_io()

From: Goldwyn Rodrigues <[email protected]>

Since we removed the last user of dio_end_io(), remove the helper
function dio_end_io().

Reviewed-by: Nikolay Borisov <[email protected]>
Reviewed-by: Johannes Thumshirn <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Goldwyn Rodrigues <[email protected]>
---
fs/direct-io.c | 19 -------------------
include/linux/fs.h | 2 --
2 files changed, 21 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 6d5370eac2a8..1543b5af400e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -386,25 +386,6 @@ static void dio_bio_end_io(struct bio *bio)
spin_unlock_irqrestore(&dio->bio_lock, flags);
}

-/**
- * dio_end_io - handle the end io action for the given bio
- * @bio: The direct io bio thats being completed
- *
- * This is meant to be called by any filesystem that uses their own dio_submit_t
- * so that the DIO specific endio actions are dealt with after the filesystem
- * has done it's completion work.
- */
-void dio_end_io(struct bio *bio)
-{
- struct dio *dio = bio->bi_private;
-
- if (dio->is_async)
- dio_bio_end_aio(bio);
- else
- dio_bio_end_io(bio);
-}
-EXPORT_SYMBOL_GPL(dio_end_io);
-
static inline void
dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
struct block_device *bdev,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3f881a892ea7..9b3f250d634c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3202,8 +3202,6 @@ enum {
DIO_SKIP_HOLES = 0x02,
};

-void dio_end_io(struct bio *bio);
-
ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, struct iov_iter *iter,
get_block_t get_block,
--
2.26.2

2020-07-08 21:21:37

by Goldwyn Rodrigues

[permalink] [raw]
Subject: [PATCH 6/6] btrfs: split btrfs_direct_IO to read and write part

From: Goldwyn Rodrigues <[email protected]>

The read and write versions don't have anything in common except for the
call to iomap_dio_rw. So split this function, and merge each half into
its only caller.

Originally proposed by Christoph Hellwig <[email protected]>

Signed-off-by: Goldwyn Rodrigues <[email protected]>
---
fs/btrfs/ctree.h | 4 +-
fs/btrfs/file.c | 95 +++++++++++++++++++++++++++++++++++++++++++-----
fs/btrfs/inode.c | 82 +----------------------------------------
3 files changed, 90 insertions(+), 91 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 677f170434e3..1037969cda63 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -28,6 +28,7 @@
#include <linux/dynamic_debug.h>
#include <linux/refcount.h>
#include <linux/crc32c.h>
+#include <linux/iomap.h>
#include "extent-io-tree.h"
#include "extent_io.h"
#include "extent_map.h"
@@ -2935,7 +2936,8 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
-ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
+extern const struct iomap_ops btrfs_dio_iomap_ops;
+extern const struct iomap_dio_ops btrfs_dops;

/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9d486350f1bf..8c738f8101c4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1825,21 +1825,65 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
return num_written ? num_written : ret;
}

-static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ const unsigned int blocksize_mask = fs_info->sectorsize - 1;
+
+ if (offset & blocksize_mask)
+ return -EINVAL;
+
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- loff_t pos;
- ssize_t written;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
ssize_t written_buffered;
loff_t endbyte;
int err;
+ size_t count = 0;
+ bool relock = false;
+ int flags = IOMAP_DIO_RWF_NO_STALE_PAGECACHE;

- written = btrfs_direct_IO(iocb, from);
+ if (check_direct_IO(fs_info, from, pos))
+ goto buffered;
+
+ count = iov_iter_count(from);
+ /*
+ * If the write DIO is beyond the EOF, we need update the isize, but it
+ * is protected by i_mutex. So we can not unlock the i_mutex at this
+ * case.
+ */
+ if (pos + count <= inode->i_size) {
+ inode_unlock(inode);
+ relock = true;
+ } else if (iocb->ki_flags & IOCB_NOWAIT) {
+ return -EAGAIN;
+ }
+
+ if (is_sync_kiocb(iocb))
+ flags |= IOMAP_DIO_RWF_SYNCIO;
+
+ down_read(&BTRFS_I(inode)->dio_sem);
+ written = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dops,
+ flags);
+ up_read(&BTRFS_I(inode)->dio_sem);
+
+ if (relock)
+ inode_lock(inode);

if (written < 0 || !iov_iter_count(from))
return written;

+buffered:
pos = iocb->ki_pos;
written_buffered = btrfs_buffered_write(iocb, from);
if (written_buffered < 0) {
@@ -1990,7 +2034,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
atomic_inc(&BTRFS_I(inode)->sync_writers);

if (iocb->ki_flags & IOCB_DIRECT) {
- num_written = __btrfs_direct_write(iocb, from);
+ num_written = btrfs_direct_write(iocb, from);
} else {
num_written = btrfs_buffered_write(iocb, from);
if (num_written > 0)
@@ -3504,16 +3548,47 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}

+static int check_direct_read(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ int ret;
+ int i, seg;
+
+ ret = check_direct_IO(fs_info, iter, offset);
+ if (ret < 0)
+ return ret;
+
+ for (seg = 0; seg < iter->nr_segs; seg++)
+ for (i = seg + 1; i < iter->nr_segs; i++)
+ if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+ return -EINVAL;
+ return 0;
+}
+
+static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+ int flags = IOMAP_DIO_RWF_NO_STALE_PAGECACHE;
+
+ if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
+ return 0;
+
+ if (is_sync_kiocb(iocb))
+ flags |= IOMAP_DIO_RWF_SYNCIO;
+
+ inode_lock_shared(inode);
+ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dops, flags);
+ inode_unlock_shared(inode);
+ return ret;
+}
+
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t ret = 0;

if (iocb->ki_flags & IOCB_DIRECT) {
- struct inode *inode = file_inode(iocb->ki_filp);
-
- inode_lock_shared(inode);
- ret = btrfs_direct_IO(iocb, to);
- inode_unlock_shared(inode);
+ ret = btrfs_direct_read(iocb, to);
if (ret < 0)
return ret;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 264b676ebf29..864415a17b1f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -29,7 +29,6 @@
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
-#include <linux/iomap.h>
#include <asm/unaligned.h>
#include "misc.h"
#include "ctree.h"
@@ -7844,92 +7843,15 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
return BLK_QC_T_NONE;
}

-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
-{
- int seg;
- int i;
- unsigned int blocksize_mask = fs_info->sectorsize - 1;
- ssize_t retval = -EINVAL;
-
- if (offset & blocksize_mask)
- goto out;
-
- if (iov_iter_alignment(iter) & blocksize_mask)
- goto out;
-
- /* If this is a write we don't need to check anymore */
- if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
- return 0;
- /*
- * Check to make sure we don't have duplicate iov_base's in this
- * iovec, if so return EINVAL, otherwise we'll get csum errors
- * when reading back.
- */
- for (seg = 0; seg < iter->nr_segs; seg++) {
- for (i = seg + 1; i < iter->nr_segs; i++) {
- if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
- goto out;
- }
- }
- retval = 0;
-out:
- return retval;
-}
-
-static const struct iomap_ops btrfs_dio_iomap_ops = {
+const struct iomap_ops btrfs_dio_iomap_ops = {
.iomap_begin = btrfs_dio_iomap_begin,
.iomap_end = btrfs_dio_iomap_end,
};

-static const struct iomap_dio_ops btrfs_dops = {
+const struct iomap_dio_ops btrfs_dops = {
.submit_io = btrfs_submit_direct,
};

-ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_changeset *data_reserved = NULL;
- loff_t offset = iocb->ki_pos;
- size_t count = 0;
- bool relock = false;
- ssize_t ret;
- int flags = IOMAP_DIO_RWF_NO_STALE_PAGECACHE;
-
- if (check_direct_IO(fs_info, iter, offset))
- return 0;
-
- count = iov_iter_count(iter);
- if (iov_iter_rw(iter) == WRITE) {
- /*
- * If the write DIO is beyond the EOF, we need update
- * the isize, but it is protected by i_mutex. So we can
- * not unlock the i_mutex at this case.
- */
- if (offset + count <= inode->i_size) {
- inode_unlock(inode);
- relock = true;
- }
- down_read(&BTRFS_I(inode)->dio_sem);
- }
-
- if (is_sync_kiocb(iocb))
- flags |= IOMAP_DIO_RWF_SYNCIO;
-
- ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dops,
- flags);
-
- if (iov_iter_rw(iter) == WRITE) {
- up_read(&BTRFS_I(inode)->dio_sem);
- }
- if (relock)
- inode_lock(inode);
- extent_changeset_free(data_reserved);
- return ret;
-}
-
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
--
2.26.2

2020-07-08 21:22:47

by Goldwyn Rodrigues

[permalink] [raw]
Subject: [PATCH 3/6] btrfs: switch to iomap_dio_rw() for dio

From: Goldwyn Rodrigues <[email protected]>

Switch from __blockdev_direct_IO() to iomap_dio_rw().
Rename btrfs_get_blocks_direct() to btrfs_dio_iomap_begin() and use it
as iomap_begin() for iomap direct I/O functions. This function
allocates and locks all the blocks required for the I/O.
btrfs_submit_direct() is used as the submit_io() hook for direct I/O
ops.

Since we need direct I/O reads to go through iomap_dio_rw(), we change
file_operations.read_iter() to a btrfs_file_read_iter() which calls
btrfs_direct_IO() for direct reads and falls back to
generic_file_buffered_read() for incomplete reads and buffered reads.

We don't need address_space.direct_IO() anymore so set it to noop.
Similarly, we don't need flags used in __blockdev_direct_IO(). iomap is
capable of direct I/O reads from a hole, so we don't need to return
-ENOENT.

BTRFS direct I/O is now done under i_rwsem, shared in case of reads and
exclusive in case of writes. This guards against simultaneous truncates.

Use iomap->iomap_end() to check for failed or incomplete direct I/O:
- for writes, call __endio_write_update_ordered()
- for reads, unlock extents

btrfs_dio_data is now hooked in iomap->private and not
current->journal_info. It carries the reservation variable and the
amount of data submitted, so we can calculate the amount of data to call
__endio_write_update_ordered in case of an error.

This patch removes last use of struct buffer_head from btrfs.

Signed-off-by: Goldwyn Rodrigues <[email protected]>
---
fs/btrfs/Kconfig | 1 +
fs/btrfs/ctree.h | 1 +
fs/btrfs/file.c | 21 +++-
fs/btrfs/inode.c | 317 ++++++++++++++++++++++-------------------------
4 files changed, 170 insertions(+), 170 deletions(-)

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 575636f6491e..68b95ad82126 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -14,6 +14,7 @@ config BTRFS_FS
select LZO_DECOMPRESS
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
+ select FS_IOMAP
select RAID6_PQ
select XOR_BLOCKS
select SRCU
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d404cce8ae40..677f170434e3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2935,6 +2935,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);

/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2520605afc25..9d486350f1bf 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1835,7 +1835,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
loff_t endbyte;
int err;

- written = generic_file_direct_write(iocb, from);
+ written = btrfs_direct_IO(iocb, from);

if (written < 0 || !iov_iter_count(from))
return written;
@@ -3504,9 +3504,26 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}

+static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret = 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ inode_lock_shared(inode);
+ ret = btrfs_direct_IO(iocb, to);
+ inode_unlock_shared(inode);
+ if (ret < 0)
+ return ret;
+ }
+
+ return generic_file_buffered_read(iocb, to, ret);
+}
+
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = btrfs_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
.mmap = btrfs_file_mmap,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 18d384f4af54..0fa75af35a1f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5,7 +5,6 @@

#include <linux/kernel.h>
#include <linux/bio.h>
-#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -30,6 +29,7 @@
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
+#include <linux/iomap.h>
#include <asm/unaligned.h>
#include "misc.h"
#include "ctree.h"
@@ -58,9 +58,9 @@ struct btrfs_iget_args {

struct btrfs_dio_data {
u64 reserve;
- u64 unsubmitted_oe_range_start;
- u64 unsubmitted_oe_range_end;
- int overwrite;
+ loff_t length;
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
};

static const struct inode_operations btrfs_dir_inode_operations;
@@ -7069,7 +7069,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
}

static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, int writing)
+ struct extent_state **cached_state, bool writing)
{
struct btrfs_ordered_extent *ordered;
int ret = 0;
@@ -7207,30 +7207,7 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
}


-static int btrfs_get_blocks_direct_read(struct extent_map *em,
- struct buffer_head *bh_result,
- struct inode *inode,
- u64 start, u64 len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
- if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return -ENOENT;
-
- len = min(len, em->len - (start - em->start));
-
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- return 0;
-}
-
static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct buffer_head *bh_result,
struct inode *inode,
struct btrfs_dio_data *dio_data,
u64 start, u64 len)
@@ -7292,7 +7269,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
}

/* this will cow the extent */
- len = bh_result->b_size;
free_extent_map(em);
*map = em = btrfs_new_extent_direct(inode, start, len);
if (IS_ERR(em)) {
@@ -7303,64 +7279,73 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
len = min(len, em->len - (start - em->start));

skip_cow:
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
-
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
- if (!dio_data->overwrite && start + len > i_size_read(inode))
+ if (start + len > i_size_read(inode))
i_size_write(inode, start + len);

- WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
- dio_data->unsubmitted_oe_range_end = start + len;
- current->journal_info = dio_data;
out:
return ret;
}

-static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = NULL;
- u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
- u64 len = bh_result->b_size;
+ const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
+ u64 len = length;
+ bool unlock_extents = false;

- if (!create)
+ if (!write)
len = min_t(u64, len, fs_info->sectorsize);

lockstart = start;
lockend = start + len - 1;

- if (current->journal_info) {
- /*
- * Need to pull our outstanding extents and set journal_info to NULL so
- * that anything that needs to check if there's a transaction doesn't get
- * confused.
- */
- dio_data = current->journal_info;
- current->journal_info = NULL;
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so we
+ * need to flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+
+ dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
+ if (!dio_data)
+ return -ENOMEM;
+
+ dio_data->length = length;
+ if (write) {
+ dio_data->reserve = round_up(length, fs_info->sectorsize);
+ ret = btrfs_delalloc_reserve_space(inode,
+ &dio_data->data_reserved,
+ start, dio_data->reserve);
+ if (ret) {
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ return ret;
+ }
}
+ iomap->private = dio_data;
+

/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
- create)) {
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
ret = -ENOTBLK;
goto err;
}
@@ -7392,36 +7377,48 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto unlock_err;
}

- if (create) {
- ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
- dio_data, start, len);
+ len = min(len, em->len - (start - em->start));
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, len);
if (ret < 0)
goto unlock_err;
-
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extents = true;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
} else {
- ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
- start, len);
- /* Can be negative only if we read from a hole */
- if (ret < 0) {
- ret = 0;
- free_extent_map(em);
- goto unlock_err;
- }
/*
* We need to unlock only the end area that we aren't using.
* The rest is going to be unlocked by the endio routine.
*/
- lockstart = start + bh_result->b_size;
- if (lockstart < lockend) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
- } else {
- free_extent_state(cached_state);
- }
+ lockstart = start + len;
+ if (lockstart < lockend)
+ unlock_extents = true;
}

+ if (unlock_extents)
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though iomap code does
+ * that, since we have locked only the parts we are performing I/O in.
+ */
+ if ((em->block_start == EXTENT_MAP_HOLE) ||
+ (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = em->block_start + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->length = len;
+
free_extent_map(em);

return 0;
@@ -7430,8 +7427,53 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- if (dio_data)
- current->journal_info = dio_data;
+ if (dio_data) {
+ btrfs_delalloc_release_space(inode, dio_data->data_reserved,
+ start, dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ }
+ return ret;
+}
+
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ int ret = 0;
+ struct btrfs_dio_data *dio_data = iomap->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
+
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
+ goto out;
+ }
+
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ __endio_write_update_ordered(inode, pos, length, false);
+ else
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1);
+ ret = -ENOTBLK;
+ }
+
+ if (write) {
+ if (dio_data->reserve)
+ btrfs_delalloc_release_space(inode,
+ dio_data->data_reserved, pos,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
+ extent_changeset_free(dio_data->data_reserved);
+ }
+out:
+ kfree(dio_data);
+ iomap->private = NULL;
+
return ret;
}

@@ -7454,7 +7496,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
dip->logical_offset + dip->bytes - 1);
}

- dio_end_io(dip->dio_bio);
+ bio_endio(dip->dio_bio);
kfree(dip);
}

@@ -7690,24 +7732,11 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
refcount_set(&dip->refs, 1);
-
- if (write) {
- struct btrfs_dio_data *dio_data = current->journal_info;
-
- /*
- * Setting range start and end to the same value means that
- * no cleanup will happen in btrfs_direct_IO
- */
- dio_data->unsubmitted_oe_range_end = dip->logical_offset +
- dip->bytes;
- dio_data->unsubmitted_oe_range_start =
- dio_data->unsubmitted_oe_range_end;
- }
return dip;
}

-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
- loff_t file_offset)
+static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+ struct bio *dio_bio, loff_t file_offset)
{
const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
@@ -7724,6 +7753,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
+ struct btrfs_dio_data *dio_data = iomap->private;

dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
if (!dip) {
@@ -7732,8 +7762,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
file_offset + dio_bio->bi_iter.bi_size - 1);
}
dio_bio->bi_status = BLK_STS_RESOURCE;
- dio_end_io(dio_bio);
- return;
+ bio_endio(dio_bio);
+ return BLK_QC_T_NONE;
}

if (!write && csum) {
@@ -7804,15 +7834,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
goto out_err;
}

+ dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
} while (submit_len > 0);
- return;
+ return BLK_QC_T_NONE;

out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
+ return BLK_QC_T_NONE;
}

static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
@@ -7848,37 +7880,31 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
return retval;
}

-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
+
+static const struct iomap_dio_ops btrfs_dops = {
+ .submit_io = btrfs_submit_direct,
+};
+
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_data dio_data = { 0 };
struct extent_changeset *data_reserved = NULL;
loff_t offset = iocb->ki_pos;
size_t count = 0;
- int flags = 0;
- bool wakeup = true;
bool relock = false;
ssize_t ret;
+ int flags = IOMAP_DIO_RWF_NO_STALE_PAGECACHE;

if (check_direct_IO(fs_info, iter, offset))
return 0;

- inode_dio_begin(inode);
-
- /*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so
- * we need to flush the dirty pages again to make absolutely sure
- * that any outstanding dirty pages are on disk.
- */
count = iov_iter_count(iter);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, offset,
- offset + count - 1);
-
if (iov_iter_rw(iter) == WRITE) {
/*
* If the write DIO is beyond the EOF, we need update
@@ -7886,68 +7912,23 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* not unlock the i_mutex at this case.
*/
if (offset + count <= inode->i_size) {
- dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
- offset, count);
- if (ret)
- goto out;
-
- /*
- * We need to know how many extents we reserved so that we can
- * do the accounting properly if we go over the number we
- * originally calculated. Abuse current->journal_info for this.
- */
- dio_data.reserve = round_up(count,
- fs_info->sectorsize);
- dio_data.unsubmitted_oe_range_start = (u64)offset;
- dio_data.unsubmitted_oe_range_end = (u64)offset;
- current->journal_info = &dio_data;
down_read(&BTRFS_I(inode)->dio_sem);
- } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- &BTRFS_I(inode)->runtime_flags)) {
- inode_dio_end(inode);
- flags = DIO_LOCKING | DIO_SKIP_HOLES;
- wakeup = false;
}

- ret = __blockdev_direct_IO(iocb, inode,
- fs_info->fs_devices->latest_bdev,
- iter, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, flags);
+ if (is_sync_kiocb(iocb))
+ flags |= IOMAP_DIO_RWF_SYNCIO;
+
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dops,
+ flags);
+
if (iov_iter_rw(iter) == WRITE) {
up_read(&BTRFS_I(inode)->dio_sem);
- current->journal_info = NULL;
- if (ret < 0 && ret != -EIOCBQUEUED) {
- if (dio_data.reserve)
- btrfs_delalloc_release_space(inode, data_reserved,
- offset, dio_data.reserve, true);
- /*
- * On error we might have left some ordered extents
- * without submitting corresponding bios for them, so
- * cleanup them up to avoid other tasks getting them
- * and waiting for them to complete forever.
- */
- if (dio_data.unsubmitted_oe_range_start <
- dio_data.unsubmitted_oe_range_end)
- __endio_write_update_ordered(inode,
- dio_data.unsubmitted_oe_range_start,
- dio_data.unsubmitted_oe_range_end -
- dio_data.unsubmitted_oe_range_start,
- false);
- } else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode, data_reserved,
- offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
}
-out:
- if (wakeup)
- inode_dio_end(inode);
if (relock)
inode_lock(inode);
-
extent_changeset_free(data_reserved);
return ret;
}
@@ -10246,7 +10227,7 @@ static const struct address_space_operations btrfs_aops = {
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
- .direct_IO = btrfs_direct_IO,
+ .direct_IO = noop_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
--
2.26.2