Hi all,
This series allows building a kernel without buffer_heads, which I
think is useful to show where the dependencies are, and maybe also
for some very limited environments, where people just need
xfs and/or btrfs and some of the read-only block based file systems.
It first switches buffered writes (but not writeback) for block devices
to use iomap unconditionally, but still using buffer_heads, and then
adds a CONFIG_BUFFER_HEAD selected by all file systems that need it
(which is most block based file systems), makes the buffer_head support
in iomap optional, and adds an alternative implementation of the block
device address_space_operations using iomap. This latter implementation
will also be useful to support block size > PAGE_SIZE for block device
nodes as buffer_heads won't work very well for that.
Note that for now the md software raid driver is also disabled as it has
some (rather questionable) buffer_head usage in the unconditionally built
bitmap code. I have a series pending to make the bitmap code conditional
and deprecate it, but it hasn't been merged yet.
This series is against Jens' for-6.6/block branch.
Changes since v2:
- fix handling of a negative return value from blkdev_direct_IO
- drop a WARN_ON that can happen when resizing block devices
- define away IOMAP_F_BUFFER_HEAD to keep the intrusions to the
iomap code minimal (even if that's not quite my preferred style)
Changes since v1:
- drop the already merged prep patches
- depend on FS_IOMAP not IOMAP
- pick a better new name for block_page_mkwrite_return
Fold emergency_thaw_bdev into its only caller, to prepare for buffer.c
to be built only when buffer_head support is enabled.
Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Luis Chamberlain <[email protected]>
Reviewed-by: Hannes Reinecke <[email protected]>
Reviewed-by: Johannes Thumshirn <[email protected]>
---
fs/buffer.c | 6 ------
fs/internal.h | 6 ------
fs/super.c | 4 +++-
3 files changed, 3 insertions(+), 13 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index bd091329026c0f..376f468e16662d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -562,12 +562,6 @@ static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
return err;
}
-void emergency_thaw_bdev(struct super_block *sb)
-{
- while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
- printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
-}
-
/**
* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
* @mapping: the mapping which wants those buffers written
diff --git a/fs/internal.h b/fs/internal.h
index f7a3dc11102647..d538d832fd608b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -23,16 +23,10 @@ struct mnt_idmap;
*/
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
-
-void emergency_thaw_bdev(struct super_block *sb);
#else
static inline void bdev_cache_init(void)
{
}
-static inline int emergency_thaw_bdev(struct super_block *sb)
-{
- return 0;
-}
#endif /* CONFIG_BLOCK */
/*
diff --git a/fs/super.c b/fs/super.c
index e781226e28800c..bc666e7ee1a984 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1029,7 +1029,9 @@ static void do_thaw_all_callback(struct super_block *sb)
{
down_write(&sb->s_umount);
if (sb->s_root && sb->s_flags & SB_BORN) {
- emergency_thaw_bdev(sb);
+ if (IS_ENABLED(CONFIG_BLOCK))
+ while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
+ pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
thaw_super_locked(sb);
} else {
up_write(&sb->s_umount);
--
2.39.2
Open code __generic_file_write_iter to remove the indirect call into
->direct_IO and to prepare for using the iomap based write code.
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/fops.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 43 insertions(+), 2 deletions(-)
diff --git a/block/fops.c b/block/fops.c
index a286bf3325c5d8..8a05d99166e3bd 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -533,6 +533,30 @@ static int blkdev_release(struct inode *inode, struct file *filp)
return 0;
}
+static ssize_t
+blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ size_t count = iov_iter_count(from);
+ ssize_t written;
+
+ written = kiocb_invalidate_pages(iocb, count);
+ if (written) {
+ if (written == -EBUSY)
+ return 0;
+ return written;
+ }
+
+ written = blkdev_direct_IO(iocb, from);
+ if (written > 0) {
+ kiocb_invalidate_post_direct_write(iocb, count);
+ iocb->ki_pos += written;
+ count -= written;
+ }
+ if (written != -EIOCBQUEUED)
+ iov_iter_revert(from, count - iov_iter_count(from));
+ return written;
+}
+
/*
* Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device.
@@ -542,7 +566,8 @@ static int blkdev_release(struct inode *inode, struct file *filp)
*/
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+ struct file *file = iocb->ki_filp;
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
struct inode *bd_inode = bdev->bd_inode;
loff_t size = bdev_nr_bytes(bdev);
size_t shorted = 0;
@@ -569,7 +594,23 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
iov_iter_truncate(from, size);
}
- ret = __generic_file_write_iter(iocb, from);
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
+
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = blkdev_direct_write(iocb, from);
+ if (ret >= 0 && iov_iter_count(from))
+ ret = direct_write_fallback(iocb, from, ret,
+ generic_perform_write(iocb, from));
+ } else {
+ ret = generic_perform_write(iocb, from);
+ }
+
if (ret > 0)
ret = generic_write_sync(iocb, ret);
iov_iter_reexpand(from, iov_iter_count(from) + shorted);
--
2.39.2
Add a new config option that controls building the buffer_head code, and
select it from all file systems and stacking drivers that need it.
For the block device nodes an alternative iomap based buffered I/O path
is provided when buffer_head support is not enabled, and iomap needs a
small tweak to define the IOMAP_F_BUFFER_HEAD flag to 0 to not call
into the buffer_head code when it doesn't exist.
Otherwise this is just Kconfig and ifdef changes.
Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Luis Chamberlain <[email protected]>
---
block/fops.c | 70 ++++++++++++++++++++++++++++++------
drivers/md/Kconfig | 1 +
fs/Kconfig | 4 +++
fs/Makefile | 2 +-
fs/adfs/Kconfig | 1 +
fs/affs/Kconfig | 1 +
fs/befs/Kconfig | 1 +
fs/bfs/Kconfig | 1 +
fs/efs/Kconfig | 1 +
fs/exfat/Kconfig | 1 +
fs/ext2/Kconfig | 1 +
fs/ext4/Kconfig | 1 +
fs/f2fs/Kconfig | 1 +
fs/fat/Kconfig | 1 +
fs/freevxfs/Kconfig | 1 +
fs/gfs2/Kconfig | 1 +
fs/hfs/Kconfig | 1 +
fs/hfsplus/Kconfig | 1 +
fs/hpfs/Kconfig | 1 +
fs/isofs/Kconfig | 1 +
fs/jfs/Kconfig | 1 +
fs/minix/Kconfig | 1 +
fs/nilfs2/Kconfig | 1 +
fs/ntfs/Kconfig | 1 +
fs/ntfs3/Kconfig | 1 +
fs/ocfs2/Kconfig | 1 +
fs/omfs/Kconfig | 1 +
fs/qnx4/Kconfig | 1 +
fs/qnx6/Kconfig | 1 +
fs/reiserfs/Kconfig | 1 +
fs/sysv/Kconfig | 1 +
fs/udf/Kconfig | 1 +
fs/ufs/Kconfig | 1 +
include/linux/buffer_head.h | 32 ++++++++---------
include/linux/iomap.h | 4 +++
include/trace/events/block.h | 2 ++
mm/migrate.c | 4 +--
37 files changed, 119 insertions(+), 29 deletions(-)
diff --git a/block/fops.c b/block/fops.c
index 063ece37d44e44..eaa98a987213d2 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -24,15 +24,6 @@ static inline struct inode *bdev_file_inode(struct file *file)
return file->f_mapping->host;
}
-static int blkdev_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- bh->b_bdev = I_BDEV(inode);
- bh->b_blocknr = iblock;
- set_buffer_mapped(bh);
- return 0;
-}
-
static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
{
blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
@@ -400,7 +391,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->type = IOMAP_MAPPED;
iomap->addr = iomap->offset;
iomap->length = isize - iomap->offset;
- iomap->flags |= IOMAP_F_BUFFER_HEAD;
+ iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */
return 0;
}
@@ -408,6 +399,16 @@ static const struct iomap_ops blkdev_iomap_ops = {
.iomap_begin = blkdev_iomap_begin,
};
+#ifdef CONFIG_BUFFER_HEAD
+static int blkdev_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ bh->b_bdev = I_BDEV(inode);
+ bh->b_blocknr = iblock;
+ set_buffer_mapped(bh);
+ return 0;
+}
+
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, blkdev_get_block, wbc);
@@ -453,6 +454,55 @@ const struct address_space_operations def_blk_aops = {
.migrate_folio = buffer_migrate_folio_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,
};
+#else /* CONFIG_BUFFER_HEAD */
+static int blkdev_read_folio(struct file *file, struct folio *folio)
+{
+ return iomap_read_folio(folio, &blkdev_iomap_ops);
+}
+
+static void blkdev_readahead(struct readahead_control *rac)
+{
+ iomap_readahead(rac, &blkdev_iomap_ops);
+}
+
+static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct inode *inode, loff_t offset)
+{
+ loff_t isize = i_size_read(inode);
+
+ if (WARN_ON_ONCE(offset >= isize))
+ return -EIO;
+ if (offset >= wpc->iomap.offset &&
+ offset < wpc->iomap.offset + wpc->iomap.length)
+ return 0;
+ return blkdev_iomap_begin(inode, offset, isize - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
+}
+
+static const struct iomap_writeback_ops blkdev_writeback_ops = {
+ .map_blocks = blkdev_map_blocks,
+};
+
+static int blkdev_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct iomap_writepage_ctx wpc = { };
+
+ return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops);
+}
+
+const struct address_space_operations def_blk_aops = {
+ .dirty_folio = filemap_dirty_folio,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
+ .read_folio = blkdev_read_folio,
+ .readahead = blkdev_readahead,
+ .writepages = blkdev_writepages,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+ .migrate_folio = filemap_migrate_folio,
+};
+#endif /* CONFIG_BUFFER_HEAD */
/*
* for a block special file file_inode(file)->i_size is zero
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 444517d1a2336a..2a8b081bce7dd8 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -15,6 +15,7 @@ if MD
config BLK_DEV_MD
tristate "RAID support"
select BLOCK_HOLDER_DEPRECATED if SYSFS
+ select BUFFER_HEAD
# BLOCK_LEGACY_AUTOLOAD requirement should be removed
# after relevant mdadm enhancements - to make "names=yes"
# the default - are widely available.
diff --git a/fs/Kconfig b/fs/Kconfig
index 18d034ec79539f..e8b17c81b83a8e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -18,8 +18,12 @@ config VALIDATE_FS_PARSER
config FS_IOMAP
bool
+config BUFFER_HEAD
+ bool
+
# old blockdev_direct_IO implementation. Use iomap for new code instead
config LEGACY_DIRECT_IO
+ depends on BUFFER_HEAD
bool
if BLOCK
diff --git a/fs/Makefile b/fs/Makefile
index e513aaee0603a0..f9541f40be4e08 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
kernel_read_file.o mnt_idmapping.o remap_range.o
-obj-$(CONFIG_BLOCK) += buffer.o mpage.o
+obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o
obj-y += notify/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index 44738fed66251f..1b97058f0c4a92 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -2,6 +2,7 @@
config ADFS_FS
tristate "ADFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
help
The Acorn Disc Filing System is the standard file system of the
RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
index 962b86374e1c15..1ae432d266c32f 100644
--- a/fs/affs/Kconfig
+++ b/fs/affs/Kconfig
@@ -2,6 +2,7 @@
config AFFS_FS
tristate "Amiga FFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
select LEGACY_DIRECT_IO
help
The Fast File System (FFS) is the common file system used on hard
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
index 9550b6462b8147..5fcfc4024ffe6f 100644
--- a/fs/befs/Kconfig
+++ b/fs/befs/Kconfig
@@ -2,6 +2,7 @@
config BEFS_FS
tristate "BeOS file system (BeFS) support (read only)"
depends on BLOCK
+ select BUFFER_HEAD
select NLS
help
The BeOS File System (BeFS) is the native file system of Be, Inc's
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
index 3a757805b58568..8e7ef866b62a62 100644
--- a/fs/bfs/Kconfig
+++ b/fs/bfs/Kconfig
@@ -2,6 +2,7 @@
config BFS_FS
tristate "BFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
help
Boot File System (BFS) is a file system used under SCO UnixWare to
allow the bootloader access to the kernel image and other important
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
index 2df1bac8b375b1..0833e533df9d53 100644
--- a/fs/efs/Kconfig
+++ b/fs/efs/Kconfig
@@ -2,6 +2,7 @@
config EFS_FS
tristate "EFS file system support (read only)"
depends on BLOCK
+ select BUFFER_HEAD
help
EFS is an older file system used for non-ISO9660 CD-ROMs and hard
disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig
index 147edeb044691d..cbeca8e44d9b38 100644
--- a/fs/exfat/Kconfig
+++ b/fs/exfat/Kconfig
@@ -2,6 +2,7 @@
config EXFAT_FS
tristate "exFAT filesystem support"
+ select BUFFER_HEAD
select NLS
select LEGACY_DIRECT_IO
help
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 77393fda99af09..74d98965902e16 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config EXT2_FS
tristate "Second extended fs support"
+ select BUFFER_HEAD
select FS_IOMAP
select LEGACY_DIRECT_IO
help
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 86699c8cab281c..e20d59221fc05b 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -28,6 +28,7 @@ config EXT3_FS_SECURITY
config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
+ select BUFFER_HEAD
select JBD2
select CRC16
select CRYPTO
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 03ef087537c7c4..68a1e23e1557c7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -2,6 +2,7 @@
config F2FS_FS
tristate "F2FS filesystem support"
depends on BLOCK
+ select BUFFER_HEAD
select NLS
select CRYPTO
select CRYPTO_CRC32
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index afe83b4e717280..25fae1c83725bc 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config FAT_FS
tristate
+ select BUFFER_HEAD
select NLS
select LEGACY_DIRECT_IO
help
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
index 0e2fc08f7de492..912107ebea6f40 100644
--- a/fs/freevxfs/Kconfig
+++ b/fs/freevxfs/Kconfig
@@ -2,6 +2,7 @@
config VXFS_FS
tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
depends on BLOCK
+ select BUFFER_HEAD
help
FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
file system format. VERITAS VxFS(TM) is the standard file system
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 03c966840422ec..be7f87a8e11ae1 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config GFS2_FS
tristate "GFS2 file system support"
+ select BUFFER_HEAD
select FS_POSIX_ACL
select CRC32
select LIBCRC32C
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
index d985066006d588..5ea5cd8ecea9c0 100644
--- a/fs/hfs/Kconfig
+++ b/fs/hfs/Kconfig
@@ -2,6 +2,7 @@
config HFS_FS
tristate "Apple Macintosh file system support"
depends on BLOCK
+ select BUFFER_HEAD
select NLS
select LEGACY_DIRECT_IO
help
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
index 8034e7827a690b..8ce4a33a9ac788 100644
--- a/fs/hfsplus/Kconfig
+++ b/fs/hfsplus/Kconfig
@@ -2,6 +2,7 @@
config HFSPLUS_FS
tristate "Apple Extended HFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
select NLS
select NLS_UTF8
select LEGACY_DIRECT_IO
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index ec975f4668775f..ac1e9318e65a4a 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -2,6 +2,7 @@
config HPFS_FS
tristate "OS/2 HPFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
select FS_IOMAP
help
OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig
index 08ffd37b9bb8f6..51434f2a471b0f 100644
--- a/fs/isofs/Kconfig
+++ b/fs/isofs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config ISO9660_FS
tristate "ISO 9660 CDROM file system support"
+ select BUFFER_HEAD
help
This is the standard file system used on CD-ROMs. It was previously
known as "High Sierra File System" and is called "hsfs" on other
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
index 51e856f0e4b8d6..17488440eef1a9 100644
--- a/fs/jfs/Kconfig
+++ b/fs/jfs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config JFS_FS
tristate "JFS filesystem support"
+ select BUFFER_HEAD
select NLS
select CRC32
select LEGACY_DIRECT_IO
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index de2003974ff0d0..90ddfad2a75e8f 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -2,6 +2,7 @@
config MINIX_FS
tristate "Minix file system support"
depends on BLOCK
+ select BUFFER_HEAD
help
Minix is a simple operating system used in many classes about OS's.
The minix file system (method to organize files on a hard disk
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 7d59567465e121..7dae168e346e30 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config NILFS2_FS
tristate "NILFS2 file system support"
+ select BUFFER_HEAD
select CRC32
select LEGACY_DIRECT_IO
help
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
index f93e69a612833f..7b2509741735a9 100644
--- a/fs/ntfs/Kconfig
+++ b/fs/ntfs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config NTFS_FS
tristate "NTFS file system support"
+ select BUFFER_HEAD
select NLS
help
NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig
index 96cc236f7f7bd3..cdfdf51e55d797 100644
--- a/fs/ntfs3/Kconfig
+++ b/fs/ntfs3/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config NTFS3_FS
tristate "NTFS Read-Write file system support"
+ select BUFFER_HEAD
select NLS
select LEGACY_DIRECT_IO
help
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 3123da7cfb301f..2514d36cbe0157 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -2,6 +2,7 @@
config OCFS2_FS
tristate "OCFS2 file system support"
depends on INET && SYSFS && CONFIGFS_FS
+ select BUFFER_HEAD
select JBD2
select CRC32
select QUOTA
diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig
index 42b2ec35a05bfb..8470f6c3e64e6a 100644
--- a/fs/omfs/Kconfig
+++ b/fs/omfs/Kconfig
@@ -2,6 +2,7 @@
config OMFS_FS
tristate "SonicBlue Optimized MPEG File System support"
depends on BLOCK
+ select BUFFER_HEAD
select CRC_ITU_T
help
This is the proprietary file system used by the Rio Karma music
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
index 45b5b98376c436..a2eb826e76c602 100644
--- a/fs/qnx4/Kconfig
+++ b/fs/qnx4/Kconfig
@@ -2,6 +2,7 @@
config QNX4FS_FS
tristate "QNX4 file system support (read only)"
depends on BLOCK
+ select BUFFER_HEAD
help
This is the file system used by the real-time operating systems
QNX 4 and QNX 6 (the latter is also called QNX RTP).
diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig
index 6a9d6bce158622..8e865d72204e75 100644
--- a/fs/qnx6/Kconfig
+++ b/fs/qnx6/Kconfig
@@ -2,6 +2,7 @@
config QNX6FS_FS
tristate "QNX6 file system support (read only)"
depends on BLOCK && CRC32
+ select BUFFER_HEAD
help
This is the file system used by the real-time operating systems
QNX 6 (also called QNX RTP).
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 4d22ecfe0fab65..0e6fe26458fede 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config REISERFS_FS
tristate "Reiserfs support (deprecated)"
+ select BUFFER_HEAD
select CRC32
select LEGACY_DIRECT_IO
help
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
index b4e23e03fbeba3..67b3f90afbfd67 100644
--- a/fs/sysv/Kconfig
+++ b/fs/sysv/Kconfig
@@ -2,6 +2,7 @@
config SYSV_FS
tristate "System V/Xenix/V7/Coherent file system support"
depends on BLOCK
+ select BUFFER_HEAD
help
SCO, Xenix and Coherent are commercial Unix systems for Intel
machines, and Version 7 was used on the DEC PDP-11. Saying Y
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 82e8bfa2dfd989..8f7ce30d47fdce 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config UDF_FS
tristate "UDF file system support"
+ select BUFFER_HEAD
select CRC_ITU_T
select NLS
select LEGACY_DIRECT_IO
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index 6d30adb6b890fc..9301e7ecd09210 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -2,6 +2,7 @@
config UFS_FS
tristate "UFS file system support (read only)"
depends on BLOCK
+ select BUFFER_HEAD
help
BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7002a9ff63a3da..c89ef50d5112fc 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -16,8 +16,6 @@
#include <linux/wait.h>
#include <linux/atomic.h>
-#ifdef CONFIG_BLOCK
-
enum bh_state_bits {
BH_Uptodate, /* Contains valid data */
BH_Dirty, /* Is dirty */
@@ -198,7 +196,6 @@ void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset);
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
unsigned long offset);
-bool try_to_free_buffers(struct folio *);
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
bool retry);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
@@ -213,10 +210,6 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate);
/* Things to do with buffers at mapping->private_list */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
-int inode_has_buffers(struct inode *);
-void invalidate_inode_buffers(struct inode *);
-int remove_inode_buffers(struct inode *inode);
-int sync_mapping_buffers(struct address_space *mapping);
int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
bool datasync);
int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
@@ -240,9 +233,6 @@ void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
struct buffer_head *__bread_gfp(struct block_device *,
sector_t block, unsigned size, gfp_t gfp);
-void invalidate_bh_lrus(void);
-void invalidate_bh_lrus_cpu(void);
-bool has_bh_in_lru(int cpu, void *dummy);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh);
@@ -258,8 +248,6 @@ int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
void __bh_read_batch(int nr, struct buffer_head *bhs[],
blk_opf_t op_flags, bool force_lock);
-extern int buffer_heads_over_limit;
-
/*
* Generic address_space_operations implementations for buffer_head-backed
* address_spaces.
@@ -304,8 +292,6 @@ extern int buffer_migrate_folio_norefs(struct address_space *,
#define buffer_migrate_folio_norefs NULL
#endif
-void buffer_init(void);
-
/*
* inline definitions
*/
@@ -465,7 +451,20 @@ __bread(struct block_device *bdev, sector_t block, unsigned size)
bool block_dirty_folio(struct address_space *mapping, struct folio *folio);
-#else /* CONFIG_BLOCK */
+#ifdef CONFIG_BUFFER_HEAD
+
+void buffer_init(void);
+bool try_to_free_buffers(struct folio *folio);
+int inode_has_buffers(struct inode *inode);
+void invalidate_inode_buffers(struct inode *inode);
+int remove_inode_buffers(struct inode *inode);
+int sync_mapping_buffers(struct address_space *mapping);
+void invalidate_bh_lrus(void);
+void invalidate_bh_lrus_cpu(void);
+bool has_bh_in_lru(int cpu, void *dummy);
+extern int buffer_heads_over_limit;
+
+#else /* CONFIG_BUFFER_HEAD */
static inline void buffer_init(void) {}
static inline bool try_to_free_buffers(struct folio *folio) { return true; }
@@ -473,9 +472,10 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; }
static inline void invalidate_inode_buffers(struct inode *inode) {}
static inline int remove_inode_buffers(struct inode *inode) { return 1; }
static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
+static inline void invalidate_bh_lrus(void) {}
static inline void invalidate_bh_lrus_cpu(void) {}
static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; }
#define buffer_heads_over_limit 0
-#endif /* CONFIG_BLOCK */
+#endif /* CONFIG_BUFFER_HEAD */
#endif /* _LINUX_BUFFER_HEAD_H */
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119ae..54f50d34fd9d4f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -58,7 +58,11 @@ struct vm_fault;
#define IOMAP_F_DIRTY (1U << 1)
#define IOMAP_F_SHARED (1U << 2)
#define IOMAP_F_MERGED (1U << 3)
+#ifdef CONFIG_BUFFER_HEAD
#define IOMAP_F_BUFFER_HEAD (1U << 4)
+#else
+#define IOMAP_F_BUFFER_HEAD 0
+#endif /* CONFIG_BUFFER_HEAD */
#define IOMAP_F_XATTR (1U << 5)
/*
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 40e60c33cc6f3d..0e128ad5146015 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -12,6 +12,7 @@
#define RWBS_LEN 8
+#ifdef CONFIG_BUFFER_HEAD
DECLARE_EVENT_CLASS(block_buffer,
TP_PROTO(struct buffer_head *bh),
@@ -61,6 +62,7 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer,
TP_ARGS(bh)
);
+#endif /* CONFIG_BUFFER_HEAD */
/**
* block_rq_requeue - place block IO request back on a queue
diff --git a/mm/migrate.c b/mm/migrate.c
index 24baad2571e314..fe6f8d454aff83 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -684,7 +684,7 @@ int migrate_folio(struct address_space *mapping, struct folio *dst,
}
EXPORT_SYMBOL(migrate_folio);
-#ifdef CONFIG_BLOCK
+#ifdef CONFIG_BUFFER_HEAD
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
enum migrate_mode mode)
@@ -837,7 +837,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping,
return __buffer_migrate_folio(mapping, dst, src, mode, true);
}
EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
-#endif
+#endif /* CONFIG_BUFFER_HEAD */
int filemap_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
--
2.39.2
On 8/1/23 19:21, Christoph Hellwig wrote:
> Open code __generic_file_write_iter to remove the indirect call into
> ->direct_IO and to prepare using the iomap based write code.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> block/fops.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 43 insertions(+), 2 deletions(-)
>
Reviewed-by: Hannes Reinecke <[email protected]>
Cheers,
Hannes
Use iomap in buffer_head compat mode to write to block devices.
Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Luis Chamberlain <[email protected]>
Reviewed-by: Pankaj Raghav <[email protected]>
Reviewed-by: Hannes Reinecke <[email protected]>
---
block/Kconfig | 1 +
block/fops.c | 31 +++++++++++++++++++++++++++++--
2 files changed, 30 insertions(+), 2 deletions(-)
diff --git a/block/Kconfig b/block/Kconfig
index 86122e459fe046..1a13ef0b1ca10c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -5,6 +5,7 @@
menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
+ select FS_IOMAP
select SBITMAP
help
Provide block layer support for the kernel.
diff --git a/block/fops.c b/block/fops.c
index f0b822c28ddfe2..063ece37d44e44 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -15,6 +15,7 @@
#include <linux/falloc.h>
#include <linux/suspend.h>
#include <linux/fs.h>
+#include <linux/iomap.h>
#include <linux/module.h>
#include "blk.h"
@@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}
+static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+ struct block_device *bdev = I_BDEV(inode);
+ loff_t isize = i_size_read(inode);
+
+ iomap->bdev = bdev;
+ iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
+ if (iomap->offset >= isize)
+ return -EIO;
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = iomap->offset;
+ iomap->length = isize - iomap->offset;
+ iomap->flags |= IOMAP_F_BUFFER_HEAD;
+ return 0;
+}
+
+static const struct iomap_ops blkdev_iomap_ops = {
+ .iomap_begin = blkdev_iomap_begin,
+};
+
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, blkdev_get_block, wbc);
@@ -556,6 +578,11 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
return written;
}
+static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
+}
+
/*
* Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device.
@@ -605,9 +632,9 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = blkdev_direct_write(iocb, from);
if (ret >= 0 && iov_iter_count(from))
ret = direct_write_fallback(iocb, from, ret,
- generic_perform_write(iocb, from));
+ blkdev_buffered_write(iocb, from));
} else {
- ret = generic_perform_write(iocb, from);
+ ret = blkdev_buffered_write(iocb, from);
}
if (ret > 0)
--
2.39.2
On Tue, Aug 01, 2023 at 07:21:58PM +0200, Christoph Hellwig wrote:
> Open code __generic_file_write_iter to remove the indirect call into
> ->direct_IO and to prepare using the iomap based write code.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Luis Chamberlain <[email protected]>
Luis
On Tue, Aug 01, 2023 at 07:21:56PM +0200, Christoph Hellwig wrote:
> Fold emergency_thaw_bdev into it's only caller, to prepare for buffer.c
> to be built only when buffer_head support is enabled.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Reviewed-by: Luis Chamberlain <[email protected]>
> Reviewed-by: Hannes Reinecke <[email protected]>
> Reviewed-by: Johannes Thumshirn <[email protected]>
> ---
Reviewed-by: Christian Brauner <[email protected]>
On Tue, Aug 01, 2023 at 07:22:00PM +0200, Christoph Hellwig wrote:
> Use iomap in buffer_head compat mode to write to block devices.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Reviewed-by: Luis Chamberlain <[email protected]>
> Reviewed-by: Pankaj Raghav <[email protected]>
> Reviewed-by: Hannes Reinecke <[email protected]>
> ---
Reviewed-by: Christian Brauner <[email protected]>
On Tue, Aug 01, 2023 at 07:21:58PM +0200, Christoph Hellwig wrote:
> Open code __generic_file_write_iter to remove the indirect call into
> ->direct_IO and to prepare using the iomap based write code.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
Reviewed-by: Christian Brauner <[email protected]>
Looks good,
Reviewed-by: Johannes Thumshirn <[email protected]>
Looks good,
Reviewed-by: Johannes Thumshirn <[email protected]>
Looks good,
Reviewed-by: Johannes Thumshirn <[email protected]>
On Tue, 01 Aug 2023 19:21:56 +0200, Christoph Hellwig wrote:
> Fold emergency_thaw_bdev into it's only caller, to prepare for buffer.c
> to be built only when buffer_head support is enabled.
>
>
Applied, thanks!
[1/6] fs: remove emergency_thaw_bdev
commit: 4a8b719f95c0dcd15fb7a04b806ad8139fa7c850
[2/6] fs: rename and move block_page_mkwrite_return
commit: 2ba39cc46bfe463cb9673bf62a04c4c21942f1f2
[3/6] block: open code __generic_file_write_iter for blkdev writes
commit: 727cfe976758b79f8d2f8051c75a5ccb14539a56
[4/6] block: stop setting ->direct_IO
commit: a05f7bd9578b17521a9a5f3689f3934c082c6390
[5/6] block: use iomap for writes to block devices
commit: 487c607df790d366e67a7d6a30adf785cdd98e55
[6/6] fs: add CONFIG_BUFFER_HEAD
commit: 925c86a19bacf8ce10eb666328fb3fa5aff7b951
Best regards,
--
Jens Axboe
On Tue, Aug 29, 2023 at 03:06:14AM +0100, Al Viro wrote:
> On Tue, Aug 01, 2023 at 07:21:58PM +0200, Christoph Hellwig wrote:
> > @@ -569,7 +594,23 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > iov_iter_truncate(from, size);
> > }
> >
> > - ret = __generic_file_write_iter(iocb, from);
> > + ret = file_remove_privs(file);
> > + if (ret)
> > + return ret;
>
> That chunk is a bit of a WTF generator... Thankfully,
>
> static int __file_remove_privs(struct file *file, unsigned int flags)
> {
> struct dentry *dentry = file_dentry(file);
> struct inode *inode = file_inode(file);
> int error = 0;
> int kill;
>
> if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
> return 0;
>
> means that it's really a no-op. But I'd still suggest
> removing it, just to reduce the amount of head-scratching
> for people who'll be reading that code later...
I'll send an incremental patch to remove it once the changes hit
Linus' tree.
Hi Christoph,
On Tue, Aug 01, 2023 at 07:22:00PM +0200, Christoph Hellwig wrote:
> Use iomap in buffer_head compat mode to write to block devices.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Reviewed-by: Luis Chamberlain <[email protected]>
> Reviewed-by: Pankaj Raghav <[email protected]>
> Reviewed-by: Hannes Reinecke <[email protected]>
> ---
> block/Kconfig | 1 +
> block/fops.c | 31 +++++++++++++++++++++++++++++--
> 2 files changed, 30 insertions(+), 2 deletions(-)
>
> diff --git a/block/Kconfig b/block/Kconfig
> index 86122e459fe046..1a13ef0b1ca10c 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -5,6 +5,7 @@
> menuconfig BLOCK
> bool "Enable the block layer" if EXPERT
> default y
> + select FS_IOMAP
> select SBITMAP
> help
> Provide block layer support for the kernel.
> diff --git a/block/fops.c b/block/fops.c
> index f0b822c28ddfe2..063ece37d44e44 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -15,6 +15,7 @@
> #include <linux/falloc.h>
> #include <linux/suspend.h>
> #include <linux/fs.h>
> +#include <linux/iomap.h>
> #include <linux/module.h>
> #include "blk.h"
>
> @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
> }
>
> +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> + unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
> +{
> + struct block_device *bdev = I_BDEV(inode);
> + loff_t isize = i_size_read(inode);
> +
> + iomap->bdev = bdev;
> + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
> + if (iomap->offset >= isize)
> + return -EIO;
> + iomap->type = IOMAP_MAPPED;
> + iomap->addr = iomap->offset;
> + iomap->length = isize - iomap->offset;
> + iomap->flags |= IOMAP_F_BUFFER_HEAD;
> + return 0;
> +}
> +
> +static const struct iomap_ops blkdev_iomap_ops = {
> + .iomap_begin = blkdev_iomap_begin,
> +};
> +
> static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
> {
> return block_write_full_page(page, blkdev_get_block, wbc);
> @@ -556,6 +578,11 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
> return written;
> }
>
> +static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
> +}
> +
> /*
> * Write data to the block device. Only intended for the block device itself
> * and the raw driver which basically is a fake block device.
> @@ -605,9 +632,9 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
> ret = blkdev_direct_write(iocb, from);
> if (ret >= 0 && iov_iter_count(from))
> ret = direct_write_fallback(iocb, from, ret,
> - generic_perform_write(iocb, from));
> + blkdev_buffered_write(iocb, from));
> } else {
> - ret = generic_perform_write(iocb, from);
> + ret = blkdev_buffered_write(iocb, from);
> }
>
> if (ret > 0)
I have been testing SSD block device write performance recently, and I found
that the write speed decreased greatly on my board (330MB/s -> 130MB/s). I
spent some time looking for the cause and finally found that it is caused by
this patch: if I revert this patch, the write speed recovers to 330MB/s.
I'm using the command below to test write performance:
dd if=/dev/zero of=/dev/sda bs=4M count=1024
I also ran more tests to narrow this down. In short, I found that the write
speed changes with the "bs=" parameter.
I wrote a total of 4GB of data to sda in each test; the results are below:
- dd if=/dev/zero of=/dev/sda bs=400K count=10485 (334 MB/s)
- dd if=/dev/zero of=/dev/sda bs=800K count=5242 (278 MB/s)
- dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (204 MB/s)
- dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (170 MB/s)
- dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (150 MB/s)
- dd if=/dev/zero of=/dev/sda bs=4500K count=932 (139 MB/s)
With this patch reverted, I got the results below:
- dd if=/dev/zero of=/dev/sda bs=400K count=10485 (339 MB/s)
- dd if=/dev/zero of=/dev/sda bs=800K count=5242 (330 MB/s)
- dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (332 MB/s)
- dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (333 MB/s)
- dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (333 MB/s)
- dd if=/dev/zero of=/dev/sda bs=4500K count=932 (333 MB/s)
I just want to know whether these results are expected when using iomap, or
whether this is a real issue?
Many thanks in advance!
Best Regards,
Xu Yang
> --
> 2.39.2
>
On Fri, Apr 26, 2024 at 06:37:27PM +0800, Xu Yang wrote:
> Hi Christoph,
>
> On Tue, Aug 01, 2023 at 07:22:00PM +0200, Christoph Hellwig wrote:
> > Use iomap in buffer_head compat mode to write to block devices.
> >
> > Signed-off-by: Christoph Hellwig <[email protected]>
> > Reviewed-by: Luis Chamberlain <[email protected]>
> > Reviewed-by: Pankaj Raghav <[email protected]>
> > Reviewed-by: Hannes Reinecke <[email protected]>
> > ---
> > block/Kconfig | 1 +
> > block/fops.c | 31 +++++++++++++++++++++++++++++--
> > 2 files changed, 30 insertions(+), 2 deletions(-)
> >
> > diff --git a/block/Kconfig b/block/Kconfig
> > index 86122e459fe046..1a13ef0b1ca10c 100644
> > --- a/block/Kconfig
> > +++ b/block/Kconfig
> > @@ -5,6 +5,7 @@
> > menuconfig BLOCK
> > bool "Enable the block layer" if EXPERT
> > default y
> > + select FS_IOMAP
> > select SBITMAP
> > help
> > Provide block layer support for the kernel.
> > diff --git a/block/fops.c b/block/fops.c
> > index f0b822c28ddfe2..063ece37d44e44 100644
> > --- a/block/fops.c
> > +++ b/block/fops.c
> > @@ -15,6 +15,7 @@
> > #include <linux/falloc.h>
> > #include <linux/suspend.h>
> > #include <linux/fs.h>
> > +#include <linux/iomap.h>
> > #include <linux/module.h>
> > #include "blk.h"
> >
> > @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> > return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
> > }
> >
> > +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> > + unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
> > +{
> > + struct block_device *bdev = I_BDEV(inode);
> > + loff_t isize = i_size_read(inode);
> > +
> > + iomap->bdev = bdev;
> > + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
> > + if (iomap->offset >= isize)
> > + return -EIO;
> > + iomap->type = IOMAP_MAPPED;
> > + iomap->addr = iomap->offset;
> > + iomap->length = isize - iomap->offset;
> > + iomap->flags |= IOMAP_F_BUFFER_HEAD;
> > + return 0;
> > +}
> > +
> > +static const struct iomap_ops blkdev_iomap_ops = {
> > + .iomap_begin = blkdev_iomap_begin,
> > +};
> > +
> > static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
> > {
> > return block_write_full_page(page, blkdev_get_block, wbc);
> > @@ -556,6 +578,11 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
> > return written;
> > }
> >
> > +static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
> > +{
> > + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
> > +}
> > +
> > /*
> > * Write data to the block device. Only intended for the block device itself
> > * and the raw driver which basically is a fake block device.
> > @@ -605,9 +632,9 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > ret = blkdev_direct_write(iocb, from);
> > if (ret >= 0 && iov_iter_count(from))
> > ret = direct_write_fallback(iocb, from, ret,
> > - generic_perform_write(iocb, from));
> > + blkdev_buffered_write(iocb, from));
> > } else {
> > - ret = generic_perform_write(iocb, from);
> > + ret = blkdev_buffered_write(iocb, from);
> > }
> >
> > if (ret > 0)
>
> I'm testing SSD block device write performance recently. I found the write
> speed descrased greatly on my board (330MB/s -> 130MB/s). Then I spent some
> time to find cause, finally find that it's caused by this patch and if I
> revert this patch, write speed can recover to 330MB/s.
>
> I'm using below command to test write performance:
> dd if=/dev/zero of=/dev/sda bs=4M count=1024
>
> And I also do more tests to get more findings. In short, I found write
> speed changes with the "bs=" parameter.
>
> I totally write 4GB data to sda for each test, the results as below:
>
> - dd if=/dev/zero of=/dev/sda bs=400K count=10485 (334 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=800K count=5242 (278 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (204 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (170 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (150 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=4500K count=932 (139 MB/s)
>
> When this patch reverted, I got below results:
>
> - dd if=/dev/zero of=/dev/sda bs=400K count=10485 (339 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=800K count=5242 (330 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (332 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (333 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (333 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=4500K count=932 (333 MB/s)
>
> I just want to know if this results is expected when uses iomap, or it's
> a real issue?
>
> Many thanks in advance!
A gentle ping.
>
> Best Regards,
> Xu Yang
>
> > --
> > 2.39.2
> >