Use iomap in buffer_head compat mode to write to block devices.
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/Kconfig | 1 +
block/fops.c | 33 +++++++++++++++++++++++++++++----
2 files changed, 30 insertions(+), 4 deletions(-)
diff --git a/block/Kconfig b/block/Kconfig
index 941b2dca70db73..672b08f0096ab4 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -5,6 +5,7 @@
menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
+ select IOMAP
select SBITMAP
help
Provide block layer support for the kernel.
diff --git a/block/fops.c b/block/fops.c
index 318247832a7bcf..7910636f8df33b 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -15,6 +15,7 @@
#include <linux/falloc.h>
#include <linux/suspend.h>
#include <linux/fs.h>
+#include <linux/iomap.h>
#include <linux/module.h>
#include "blk.h"
@@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}
+static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+ struct block_device *bdev = I_BDEV(inode);
+ loff_t isize = i_size_read(inode);
+
+ iomap->bdev = bdev;
+ iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
+ if (WARN_ON_ONCE(iomap->offset >= isize))
+ return -EIO;
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = iomap->offset;
+ iomap->length = isize - iomap->offset;
+ iomap->flags |= IOMAP_F_BUFFER_HEAD;
+ return 0;
+}
+
+static const struct iomap_ops blkdev_iomap_ops = {
+ .iomap_begin = blkdev_iomap_begin,
+};
+
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, blkdev_get_block, wbc);
@@ -530,6 +552,11 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
return written;
}
+static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
+}
+
/*
* Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device.
@@ -575,16 +602,14 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
iov_iter_truncate(from, size);
}
- current->backing_dev_info = bdev->bd_disk->bdi;
if (iocb->ki_flags & IOCB_DIRECT) {
ret = blkdev_direct_write(iocb, from);
if (ret >= 0 && iov_iter_count(from))
ret = direct_write_fallback(iocb, from, ret,
- generic_perform_write(iocb, from));
+ blkdev_buffered_write(iocb, from));
} else {
- ret = generic_perform_write(iocb, from);
+ ret = blkdev_buffered_write(iocb, from);
}
- current->backing_dev_info = NULL;
if (ret > 0)
ret = generic_write_sync(iocb, ret);
--
2.39.2
On 4/24/23 07:49, Christoph Hellwig wrote:
> Use iomap in buffer_head compat mode to write to block devices.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> block/Kconfig | 1 +
> block/fops.c | 33 +++++++++++++++++++++++++++++----
> 2 files changed, 30 insertions(+), 4 deletions(-)
>
> diff --git a/block/Kconfig b/block/Kconfig
> index 941b2dca70db73..672b08f0096ab4 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -5,6 +5,7 @@
> menuconfig BLOCK
> bool "Enable the block layer" if EXPERT
> default y
> + select IOMAP
> select SBITMAP
> help
> Provide block layer support for the kernel.
> diff --git a/block/fops.c b/block/fops.c
> index 318247832a7bcf..7910636f8df33b 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -15,6 +15,7 @@
> #include <linux/falloc.h>
> #include <linux/suspend.h>
> #include <linux/fs.h>
> +#include <linux/iomap.h>
> #include <linux/module.h>
> #include "blk.h"
>
> @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
> }
>
> +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> + unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
> +{
> + struct block_device *bdev = I_BDEV(inode);
> + loff_t isize = i_size_read(inode);
> +
> + iomap->bdev = bdev;
> + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
> + if (WARN_ON_ONCE(iomap->offset >= isize))
> + return -EIO;
I'm hitting this during booting:
[ 5.016324] <TASK>
[ 5.030256] iomap_iter+0x11a/0x350
[ 5.030264] iomap_readahead+0x1eb/0x2c0
[ 5.030272] read_pages+0x5d/0x220
[ 5.030279] page_cache_ra_unbounded+0x131/0x180
[ 5.030284] filemap_get_pages+0xff/0x5a0
[ 5.030292] filemap_read+0xca/0x320
[ 5.030296] ? aa_file_perm+0x126/0x500
[ 5.040216] ? touch_atime+0xc8/0x150
[ 5.040224] blkdev_read_iter+0xb0/0x150
[ 5.040228] vfs_read+0x226/0x2d0
[ 5.040234] ksys_read+0xa5/0xe0
[ 5.040238] do_syscall_64+0x5b/0x80
Maybe we should consider this patch:
diff --git a/block/fops.c b/block/fops.c
index 524b8a828aad..d202fb663f25 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -386,10 +386,13 @@ static int blkdev_iomap_begin(struct inode *inode,
loff_t offset, loff_t length,
iomap->bdev = bdev;
iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
- if (WARN_ON_ONCE(iomap->offset >= isize))
- return -EIO;
- iomap->type = IOMAP_MAPPED;
- iomap->addr = iomap->offset;
+ if (WARN_ON_ONCE(iomap->offset >= isize)) {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = iomap->offset;
+ }
iomap->length = isize - iomap->offset;
if (IS_ENABLED(CONFIG_BUFFER_HEAD))
iomap->flags |= IOMAP_F_BUFFER_HEAD;
Other that the the system seems fine.
Cheers,
Hannes
On Fri, May 19, 2023 at 04:22:01PM +0200, Hannes Reinecke wrote:
> On 4/24/23 07:49, Christoph Hellwig wrote:
> > Use iomap in buffer_head compat mode to write to block devices.
> >
> > Signed-off-by: Christoph Hellwig <[email protected]>
> > ---
> > block/Kconfig | 1 +
> > block/fops.c | 33 +++++++++++++++++++++++++++++----
> > 2 files changed, 30 insertions(+), 4 deletions(-)
> >
> > diff --git a/block/Kconfig b/block/Kconfig
> > index 941b2dca70db73..672b08f0096ab4 100644
> > --- a/block/Kconfig
> > +++ b/block/Kconfig
> > @@ -5,6 +5,7 @@
> > menuconfig BLOCK
> > bool "Enable the block layer" if EXPERT
> > default y
> > + select IOMAP
> > select SBITMAP
> > help
> > Provide block layer support for the kernel.
> > diff --git a/block/fops.c b/block/fops.c
> > index 318247832a7bcf..7910636f8df33b 100644
> > --- a/block/fops.c
> > +++ b/block/fops.c
> > @@ -15,6 +15,7 @@
> > #include <linux/falloc.h>
> > #include <linux/suspend.h>
> > #include <linux/fs.h>
> > +#include <linux/iomap.h>
> > #include <linux/module.h>
> > #include "blk.h"
> > @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> > return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
> > }
> > +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> > + unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
> > +{
> > + struct block_device *bdev = I_BDEV(inode);
> > + loff_t isize = i_size_read(inode);
> > +
> > + iomap->bdev = bdev;
> > + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
> > + if (WARN_ON_ONCE(iomap->offset >= isize))
> > + return -EIO;
>
> I'm hitting this during booting:
> [ 5.016324] <TASK>
> [ 5.030256] iomap_iter+0x11a/0x350
> [ 5.030264] iomap_readahead+0x1eb/0x2c0
> [ 5.030272] read_pages+0x5d/0x220
> [ 5.030279] page_cache_ra_unbounded+0x131/0x180
> [ 5.030284] filemap_get_pages+0xff/0x5a0
Why is filemap_get_pages() using unbounded readahead? Surely
readahead should be limited to reading within EOF....
> [ 5.030292] filemap_read+0xca/0x320
> [ 5.030296] ? aa_file_perm+0x126/0x500
> [ 5.040216] ? touch_atime+0xc8/0x150
> [ 5.040224] blkdev_read_iter+0xb0/0x150
> [ 5.040228] vfs_read+0x226/0x2d0
> [ 5.040234] ksys_read+0xa5/0xe0
> [ 5.040238] do_syscall_64+0x5b/0x80
>
> Maybe we should consider this patch:
>
> diff --git a/block/fops.c b/block/fops.c
> index 524b8a828aad..d202fb663f25 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -386,10 +386,13 @@ static int blkdev_iomap_begin(struct inode *inode,
> loff_t offset, loff_t length,
>
> iomap->bdev = bdev;
> iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
> - if (WARN_ON_ONCE(iomap->offset >= isize))
> - return -EIO;
> - iomap->type = IOMAP_MAPPED;
> - iomap->addr = iomap->offset;
> + if (WARN_ON_ONCE(iomap->offset >= isize)) {
> + iomap->type = IOMAP_HOLE;
> + iomap->addr = IOMAP_NULL_ADDR;
> + } else {
> + iomap->type = IOMAP_MAPPED;
> + iomap->addr = iomap->offset;
> + }
I think Christoph's code is correct. IMO, any attempt to read beyond
the end of the device should throw out a warning and return an
error, not silently return zeros.
If readahead is trying to read beyond the end of the device, then it
really seems to me like the problem here is readahead, not the iomap
code detecting the OOB read request....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, May 24, 2023 at 08:27:13AM +1000, Dave Chinner wrote:
> On Fri, May 19, 2023 at 04:22:01PM +0200, Hannes Reinecke wrote:
> > I'm hitting this during booting:
> > [ 5.016324] <TASK>
> > [ 5.030256] iomap_iter+0x11a/0x350
> > [ 5.030264] iomap_readahead+0x1eb/0x2c0
> > [ 5.030272] read_pages+0x5d/0x220
> > [ 5.030279] page_cache_ra_unbounded+0x131/0x180
> > [ 5.030284] filemap_get_pages+0xff/0x5a0
>
> Why is filemap_get_pages() using unbounded readahead? Surely
> readahead should be limited to reading within EOF....
It isn't using unbounded readahead; that's an artifact of this
incomplete stack trace. Actual call stack:
page_cache_ra_unbounded
do_page_cache_ra
ondemand_readahead
page_cache_sync_ra
page_cache_sync_readahead
filemap_get_pages
As you can see, do_page_cache_ra() does limit readahead to i_size.
Is ractl->mapping->host the correct way to find the inode? I always
get confused.
> I think Christoph's code is correct. IMO, any attempt to read beyond
> the end of the device should throw out a warning and return an
> error, not silently return zeros.
>
> If readahead is trying to read beyond the end of the device, then it
> really seems to me like the problem here is readahead, not the iomap
> code detecting the OOB read request....
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> [email protected]
On 7/20/23 14:06, Christoph Hellwig wrote:
> On Fri, May 19, 2023 at 04:22:01PM +0200, Hannes Reinecke wrote:
>> I'm hitting this during booting:
>> [ 5.016324] <TASK>
>> [ 5.030256] iomap_iter+0x11a/0x350
>> [ 5.030264] iomap_readahead+0x1eb/0x2c0
>> [ 5.030272] read_pages+0x5d/0x220
>> [ 5.030279] page_cache_ra_unbounded+0x131/0x180
>> [ 5.030284] filemap_get_pages+0xff/0x5a0
>> [ 5.030292] filemap_read+0xca/0x320
>> [ 5.030296] ? aa_file_perm+0x126/0x500
>> [ 5.040216] ? touch_atime+0xc8/0x150
>> [ 5.040224] blkdev_read_iter+0xb0/0x150
>> [ 5.040228] vfs_read+0x226/0x2d0
>> [ 5.040234] ksys_read+0xa5/0xe0
>> [ 5.040238] do_syscall_64+0x5b/0x80
>>
>> Maybe we should consider this patch:
>
> As willy said this should be taken care of by the i_size check.
> Did you run with just this patch set or some of the large block
> size experiments on top which might change the variables?
>
> I'll repost the series today without any chances in the area, and
> if you can reproduce it with just that series we need to root
> cause it, so please send your kernel and VM config along for the
> next report.
I _think_ it's been resolve now; I've rewritten my patchset (and the
patches where it's based upon) several times now, so it might be a stale
issue now.
Eagerly awaiting your patchset.
Cheers,
Hannes