2023-08-01 18:58:45

by Christoph Hellwig

[permalink] [raw]
Subject: allow building a kernel without buffer_heads v3

Hi all,

This series allows building a kernel without buffer_heads, which I
think is useful to show where the dependencies are, and maybe also
for some very limited environments, where people just need
xfs and/or btrfs and some of the read-only block based file systems.

It first switches buffered writes (but not writeback) for block devices
to use iomap unconditionally, but still using buffer_heads, and then
adds a CONFIG_BUFFER_HEAD selected by all file systems that need it
(which is most block based file systems), makes the buffer_head support
in iomap optional, and adds an alternative implementation of the block
device address_space_operations using iomap. This latter implementation
will also be useful to support block size > PAGE_SIZE for block device
nodes as buffer_heads won't work very well for that.

Note that for now the md software raid driver is also disabled as it has
some (rather questionable) buffer_head usage in the unconditionally built
bitmap code. I have a series pending to make the bitmap code conditional
and deprecate it, but it hasn't been merged yet.

This series is against Jens' for-6.6/block branch.

Changes since v2:
- fix handling of a negative return value from blkdev_direct_IO
- drop a WARN_ON that can happen when resizing block devices
- define away IOMAP_F_BUFFER_HEAD to keep the intrusions to the
iomap code minimal (even if that's not quite my preferred style)

Changes since v1:
- drop the already merged prep patches
- depend on FS_IOMAP not IOMAP
- pick a better new name for block_page_mkwrite_return


2023-08-01 18:59:49

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 1/6] fs: remove emergency_thaw_bdev

Fold emergency_thaw_bdev into its only caller, to prepare for buffer.c
to be built only when buffer_head support is enabled.

Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Luis Chamberlain <[email protected]>
Reviewed-by: Hannes Reinecke <[email protected]>
Reviewed-by: Johannes Thumshirn <[email protected]>
---
fs/buffer.c | 6 ------
fs/internal.h | 6 ------
fs/super.c | 4 +++-
3 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index bd091329026c0f..376f468e16662d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -562,12 +562,6 @@ static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
return err;
}

-void emergency_thaw_bdev(struct super_block *sb)
-{
- while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
- printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
-}
-
/**
* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
* @mapping: the mapping which wants those buffers written
diff --git a/fs/internal.h b/fs/internal.h
index f7a3dc11102647..d538d832fd608b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -23,16 +23,10 @@ struct mnt_idmap;
*/
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
-
-void emergency_thaw_bdev(struct super_block *sb);
#else
static inline void bdev_cache_init(void)
{
}
-static inline int emergency_thaw_bdev(struct super_block *sb)
-{
- return 0;
-}
#endif /* CONFIG_BLOCK */

/*
diff --git a/fs/super.c b/fs/super.c
index e781226e28800c..bc666e7ee1a984 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1029,7 +1029,9 @@ static void do_thaw_all_callback(struct super_block *sb)
{
down_write(&sb->s_umount);
if (sb->s_root && sb->s_flags & SB_BORN) {
- emergency_thaw_bdev(sb);
+ if (IS_ENABLED(CONFIG_BLOCK))
+ while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
+ pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
thaw_super_locked(sb);
} else {
up_write(&sb->s_umount);
--
2.39.2


2023-08-01 19:10:17

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 3/6] block: open code __generic_file_write_iter for blkdev writes

Open code __generic_file_write_iter to remove the indirect call into
->direct_IO and to prepare for using the iomap based write code.

Signed-off-by: Christoph Hellwig <[email protected]>
---
block/fops.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/block/fops.c b/block/fops.c
index a286bf3325c5d8..8a05d99166e3bd 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -533,6 +533,30 @@ static int blkdev_release(struct inode *inode, struct file *filp)
return 0;
}

+static ssize_t
+blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ size_t count = iov_iter_count(from);
+ ssize_t written;
+
+ written = kiocb_invalidate_pages(iocb, count);
+ if (written) {
+ if (written == -EBUSY)
+ return 0;
+ return written;
+ }
+
+ written = blkdev_direct_IO(iocb, from);
+ if (written > 0) {
+ kiocb_invalidate_post_direct_write(iocb, count);
+ iocb->ki_pos += written;
+ count -= written;
+ }
+ if (written != -EIOCBQUEUED)
+ iov_iter_revert(from, count - iov_iter_count(from));
+ return written;
+}
+
/*
* Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device.
@@ -542,7 +566,8 @@ static int blkdev_release(struct inode *inode, struct file *filp)
*/
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+ struct file *file = iocb->ki_filp;
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
struct inode *bd_inode = bdev->bd_inode;
loff_t size = bdev_nr_bytes(bdev);
size_t shorted = 0;
@@ -569,7 +594,23 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
iov_iter_truncate(from, size);
}

- ret = __generic_file_write_iter(iocb, from);
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
+
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = blkdev_direct_write(iocb, from);
+ if (ret >= 0 && iov_iter_count(from))
+ ret = direct_write_fallback(iocb, from, ret,
+ generic_perform_write(iocb, from));
+ } else {
+ ret = generic_perform_write(iocb, from);
+ }
+
if (ret > 0)
ret = generic_write_sync(iocb, ret);
iov_iter_reexpand(from, iov_iter_count(from) + shorted);
--
2.39.2


2023-08-01 19:10:49

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 6/6] fs: add CONFIG_BUFFER_HEAD

Add a new config option that controls building the buffer_head code, and
select it from all file systems and stacking drivers that need it.

For the block device nodes an alternative iomap based buffered I/O path
is provided when buffer_head support is not enabled, and iomap needs a
small tweak to define the IOMAP_F_BUFFER_HEAD flag to 0 to not call
into the buffer_head code when it doesn't exist.

Otherwise this is just Kconfig and ifdef changes.

Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Luis Chamberlain <[email protected]>
---
block/fops.c | 70 ++++++++++++++++++++++++++++++------
drivers/md/Kconfig | 1 +
fs/Kconfig | 4 +++
fs/Makefile | 2 +-
fs/adfs/Kconfig | 1 +
fs/affs/Kconfig | 1 +
fs/befs/Kconfig | 1 +
fs/bfs/Kconfig | 1 +
fs/efs/Kconfig | 1 +
fs/exfat/Kconfig | 1 +
fs/ext2/Kconfig | 1 +
fs/ext4/Kconfig | 1 +
fs/f2fs/Kconfig | 1 +
fs/fat/Kconfig | 1 +
fs/freevxfs/Kconfig | 1 +
fs/gfs2/Kconfig | 1 +
fs/hfs/Kconfig | 1 +
fs/hfsplus/Kconfig | 1 +
fs/hpfs/Kconfig | 1 +
fs/isofs/Kconfig | 1 +
fs/jfs/Kconfig | 1 +
fs/minix/Kconfig | 1 +
fs/nilfs2/Kconfig | 1 +
fs/ntfs/Kconfig | 1 +
fs/ntfs3/Kconfig | 1 +
fs/ocfs2/Kconfig | 1 +
fs/omfs/Kconfig | 1 +
fs/qnx4/Kconfig | 1 +
fs/qnx6/Kconfig | 1 +
fs/reiserfs/Kconfig | 1 +
fs/sysv/Kconfig | 1 +
fs/udf/Kconfig | 1 +
fs/ufs/Kconfig | 1 +
include/linux/buffer_head.h | 32 ++++++++---------
include/linux/iomap.h | 4 +++
include/trace/events/block.h | 2 ++
mm/migrate.c | 4 +--
37 files changed, 119 insertions(+), 29 deletions(-)

diff --git a/block/fops.c b/block/fops.c
index 063ece37d44e44..eaa98a987213d2 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -24,15 +24,6 @@ static inline struct inode *bdev_file_inode(struct file *file)
return file->f_mapping->host;
}

-static int blkdev_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- bh->b_bdev = I_BDEV(inode);
- bh->b_blocknr = iblock;
- set_buffer_mapped(bh);
- return 0;
-}
-
static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
{
blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
@@ -400,7 +391,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->type = IOMAP_MAPPED;
iomap->addr = iomap->offset;
iomap->length = isize - iomap->offset;
- iomap->flags |= IOMAP_F_BUFFER_HEAD;
+ iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */
return 0;
}

@@ -408,6 +399,16 @@ static const struct iomap_ops blkdev_iomap_ops = {
.iomap_begin = blkdev_iomap_begin,
};

+#ifdef CONFIG_BUFFER_HEAD
+static int blkdev_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ bh->b_bdev = I_BDEV(inode);
+ bh->b_blocknr = iblock;
+ set_buffer_mapped(bh);
+ return 0;
+}
+
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, blkdev_get_block, wbc);
@@ -453,6 +454,55 @@ const struct address_space_operations def_blk_aops = {
.migrate_folio = buffer_migrate_folio_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,
};
+#else /* CONFIG_BUFFER_HEAD */
+static int blkdev_read_folio(struct file *file, struct folio *folio)
+{
+ return iomap_read_folio(folio, &blkdev_iomap_ops);
+}
+
+static void blkdev_readahead(struct readahead_control *rac)
+{
+ iomap_readahead(rac, &blkdev_iomap_ops);
+}
+
+static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct inode *inode, loff_t offset)
+{
+ loff_t isize = i_size_read(inode);
+
+ if (WARN_ON_ONCE(offset >= isize))
+ return -EIO;
+ if (offset >= wpc->iomap.offset &&
+ offset < wpc->iomap.offset + wpc->iomap.length)
+ return 0;
+ return blkdev_iomap_begin(inode, offset, isize - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
+}
+
+static const struct iomap_writeback_ops blkdev_writeback_ops = {
+ .map_blocks = blkdev_map_blocks,
+};
+
+static int blkdev_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct iomap_writepage_ctx wpc = { };
+
+ return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops);
+}
+
+const struct address_space_operations def_blk_aops = {
+ .dirty_folio = filemap_dirty_folio,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
+ .read_folio = blkdev_read_folio,
+ .readahead = blkdev_readahead,
+ .writepages = blkdev_writepages,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+ .migrate_folio = filemap_migrate_folio,
+};
+#endif /* CONFIG_BUFFER_HEAD */

/*
* for a block special file file_inode(file)->i_size is zero
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 444517d1a2336a..2a8b081bce7dd8 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -15,6 +15,7 @@ if MD
config BLK_DEV_MD
tristate "RAID support"
select BLOCK_HOLDER_DEPRECATED if SYSFS
+ select BUFFER_HEAD
# BLOCK_LEGACY_AUTOLOAD requirement should be removed
# after relevant mdadm enhancements - to make "names=yes"
# the default - are widely available.
diff --git a/fs/Kconfig b/fs/Kconfig
index 18d034ec79539f..e8b17c81b83a8e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -18,8 +18,12 @@ config VALIDATE_FS_PARSER
config FS_IOMAP
bool

+config BUFFER_HEAD
+ bool
+
# old blockdev_direct_IO implementation. Use iomap for new code instead
config LEGACY_DIRECT_IO
+ depends on BUFFER_HEAD
bool

if BLOCK
diff --git a/fs/Makefile b/fs/Makefile
index e513aaee0603a0..f9541f40be4e08 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
kernel_read_file.o mnt_idmapping.o remap_range.o

-obj-$(CONFIG_BLOCK) += buffer.o mpage.o
+obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o
obj-y += notify/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index 44738fed66251f..1b97058f0c4a92 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -2,6 +2,7 @@
config ADFS_FS
tristate "ADFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
help
The Acorn Disc Filing System is the standard file system of the
RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
index 962b86374e1c15..1ae432d266c32f 100644
--- a/fs/affs/Kconfig
+++ b/fs/affs/Kconfig
@@ -2,6 +2,7 @@
config AFFS_FS
tristate "Amiga FFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
select LEGACY_DIRECT_IO
help
The Fast File System (FFS) is the common file system used on hard
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
index 9550b6462b8147..5fcfc4024ffe6f 100644
--- a/fs/befs/Kconfig
+++ b/fs/befs/Kconfig
@@ -2,6 +2,7 @@
config BEFS_FS
tristate "BeOS file system (BeFS) support (read only)"
depends on BLOCK
+ select BUFFER_HEAD
select NLS
help
The BeOS File System (BeFS) is the native file system of Be, Inc's
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
index 3a757805b58568..8e7ef866b62a62 100644
--- a/fs/bfs/Kconfig
+++ b/fs/bfs/Kconfig
@@ -2,6 +2,7 @@
config BFS_FS
tristate "BFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
help
Boot File System (BFS) is a file system used under SCO UnixWare to
allow the bootloader access to the kernel image and other important
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
index 2df1bac8b375b1..0833e533df9d53 100644
--- a/fs/efs/Kconfig
+++ b/fs/efs/Kconfig
@@ -2,6 +2,7 @@
config EFS_FS
tristate "EFS file system support (read only)"
depends on BLOCK
+ select BUFFER_HEAD
help
EFS is an older file system used for non-ISO9660 CD-ROMs and hard
disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig
index 147edeb044691d..cbeca8e44d9b38 100644
--- a/fs/exfat/Kconfig
+++ b/fs/exfat/Kconfig
@@ -2,6 +2,7 @@

config EXFAT_FS
tristate "exFAT filesystem support"
+ select BUFFER_HEAD
select NLS
select LEGACY_DIRECT_IO
help
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 77393fda99af09..74d98965902e16 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config EXT2_FS
tristate "Second extended fs support"
+ select BUFFER_HEAD
select FS_IOMAP
select LEGACY_DIRECT_IO
help
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 86699c8cab281c..e20d59221fc05b 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -28,6 +28,7 @@ config EXT3_FS_SECURITY

config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
+ select BUFFER_HEAD
select JBD2
select CRC16
select CRYPTO
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 03ef087537c7c4..68a1e23e1557c7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -2,6 +2,7 @@
config F2FS_FS
tristate "F2FS filesystem support"
depends on BLOCK
+ select BUFFER_HEAD
select NLS
select CRYPTO
select CRYPTO_CRC32
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index afe83b4e717280..25fae1c83725bc 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config FAT_FS
tristate
+ select BUFFER_HEAD
select NLS
select LEGACY_DIRECT_IO
help
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
index 0e2fc08f7de492..912107ebea6f40 100644
--- a/fs/freevxfs/Kconfig
+++ b/fs/freevxfs/Kconfig
@@ -2,6 +2,7 @@
config VXFS_FS
tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
depends on BLOCK
+ select BUFFER_HEAD
help
FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
file system format. VERITAS VxFS(TM) is the standard file system
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 03c966840422ec..be7f87a8e11ae1 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config GFS2_FS
tristate "GFS2 file system support"
+ select BUFFER_HEAD
select FS_POSIX_ACL
select CRC32
select LIBCRC32C
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
index d985066006d588..5ea5cd8ecea9c0 100644
--- a/fs/hfs/Kconfig
+++ b/fs/hfs/Kconfig
@@ -2,6 +2,7 @@
config HFS_FS
tristate "Apple Macintosh file system support"
depends on BLOCK
+ select BUFFER_HEAD
select NLS
select LEGACY_DIRECT_IO
help
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
index 8034e7827a690b..8ce4a33a9ac788 100644
--- a/fs/hfsplus/Kconfig
+++ b/fs/hfsplus/Kconfig
@@ -2,6 +2,7 @@
config HFSPLUS_FS
tristate "Apple Extended HFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
select NLS
select NLS_UTF8
select LEGACY_DIRECT_IO
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index ec975f4668775f..ac1e9318e65a4a 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -2,6 +2,7 @@
config HPFS_FS
tristate "OS/2 HPFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
select FS_IOMAP
help
OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig
index 08ffd37b9bb8f6..51434f2a471b0f 100644
--- a/fs/isofs/Kconfig
+++ b/fs/isofs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config ISO9660_FS
tristate "ISO 9660 CDROM file system support"
+ select BUFFER_HEAD
help
This is the standard file system used on CD-ROMs. It was previously
known as "High Sierra File System" and is called "hsfs" on other
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
index 51e856f0e4b8d6..17488440eef1a9 100644
--- a/fs/jfs/Kconfig
+++ b/fs/jfs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config JFS_FS
tristate "JFS filesystem support"
+ select BUFFER_HEAD
select NLS
select CRC32
select LEGACY_DIRECT_IO
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index de2003974ff0d0..90ddfad2a75e8f 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -2,6 +2,7 @@
config MINIX_FS
tristate "Minix file system support"
depends on BLOCK
+ select BUFFER_HEAD
help
Minix is a simple operating system used in many classes about OS's.
The minix file system (method to organize files on a hard disk
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 7d59567465e121..7dae168e346e30 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config NILFS2_FS
tristate "NILFS2 file system support"
+ select BUFFER_HEAD
select CRC32
select LEGACY_DIRECT_IO
help
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
index f93e69a612833f..7b2509741735a9 100644
--- a/fs/ntfs/Kconfig
+++ b/fs/ntfs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config NTFS_FS
tristate "NTFS file system support"
+ select BUFFER_HEAD
select NLS
help
NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig
index 96cc236f7f7bd3..cdfdf51e55d797 100644
--- a/fs/ntfs3/Kconfig
+++ b/fs/ntfs3/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config NTFS3_FS
tristate "NTFS Read-Write file system support"
+ select BUFFER_HEAD
select NLS
select LEGACY_DIRECT_IO
help
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 3123da7cfb301f..2514d36cbe0157 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -2,6 +2,7 @@
config OCFS2_FS
tristate "OCFS2 file system support"
depends on INET && SYSFS && CONFIGFS_FS
+ select BUFFER_HEAD
select JBD2
select CRC32
select QUOTA
diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig
index 42b2ec35a05bfb..8470f6c3e64e6a 100644
--- a/fs/omfs/Kconfig
+++ b/fs/omfs/Kconfig
@@ -2,6 +2,7 @@
config OMFS_FS
tristate "SonicBlue Optimized MPEG File System support"
depends on BLOCK
+ select BUFFER_HEAD
select CRC_ITU_T
help
This is the proprietary file system used by the Rio Karma music
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
index 45b5b98376c436..a2eb826e76c602 100644
--- a/fs/qnx4/Kconfig
+++ b/fs/qnx4/Kconfig
@@ -2,6 +2,7 @@
config QNX4FS_FS
tristate "QNX4 file system support (read only)"
depends on BLOCK
+ select BUFFER_HEAD
help
This is the file system used by the real-time operating systems
QNX 4 and QNX 6 (the latter is also called QNX RTP).
diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig
index 6a9d6bce158622..8e865d72204e75 100644
--- a/fs/qnx6/Kconfig
+++ b/fs/qnx6/Kconfig
@@ -2,6 +2,7 @@
config QNX6FS_FS
tristate "QNX6 file system support (read only)"
depends on BLOCK && CRC32
+ select BUFFER_HEAD
help
This is the file system used by the real-time operating systems
QNX 6 (also called QNX RTP).
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 4d22ecfe0fab65..0e6fe26458fede 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config REISERFS_FS
tristate "Reiserfs support (deprecated)"
+ select BUFFER_HEAD
select CRC32
select LEGACY_DIRECT_IO
help
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
index b4e23e03fbeba3..67b3f90afbfd67 100644
--- a/fs/sysv/Kconfig
+++ b/fs/sysv/Kconfig
@@ -2,6 +2,7 @@
config SYSV_FS
tristate "System V/Xenix/V7/Coherent file system support"
depends on BLOCK
+ select BUFFER_HEAD
help
SCO, Xenix and Coherent are commercial Unix systems for Intel
machines, and Version 7 was used on the DEC PDP-11. Saying Y
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 82e8bfa2dfd989..8f7ce30d47fdce 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config UDF_FS
tristate "UDF file system support"
+ select BUFFER_HEAD
select CRC_ITU_T
select NLS
select LEGACY_DIRECT_IO
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index 6d30adb6b890fc..9301e7ecd09210 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -2,6 +2,7 @@
config UFS_FS
tristate "UFS file system support (read only)"
depends on BLOCK
+ select BUFFER_HEAD
help
BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7002a9ff63a3da..c89ef50d5112fc 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -16,8 +16,6 @@
#include <linux/wait.h>
#include <linux/atomic.h>

-#ifdef CONFIG_BLOCK
-
enum bh_state_bits {
BH_Uptodate, /* Contains valid data */
BH_Dirty, /* Is dirty */
@@ -198,7 +196,6 @@ void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset);
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
unsigned long offset);
-bool try_to_free_buffers(struct folio *);
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
bool retry);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
@@ -213,10 +210,6 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate);

/* Things to do with buffers at mapping->private_list */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
-int inode_has_buffers(struct inode *);
-void invalidate_inode_buffers(struct inode *);
-int remove_inode_buffers(struct inode *inode);
-int sync_mapping_buffers(struct address_space *mapping);
int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
bool datasync);
int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
@@ -240,9 +233,6 @@ void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
struct buffer_head *__bread_gfp(struct block_device *,
sector_t block, unsigned size, gfp_t gfp);
-void invalidate_bh_lrus(void);
-void invalidate_bh_lrus_cpu(void);
-bool has_bh_in_lru(int cpu, void *dummy);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh);
@@ -258,8 +248,6 @@ int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
void __bh_read_batch(int nr, struct buffer_head *bhs[],
blk_opf_t op_flags, bool force_lock);

-extern int buffer_heads_over_limit;
-
/*
* Generic address_space_operations implementations for buffer_head-backed
* address_spaces.
@@ -304,8 +292,6 @@ extern int buffer_migrate_folio_norefs(struct address_space *,
#define buffer_migrate_folio_norefs NULL
#endif

-void buffer_init(void);
-
/*
* inline definitions
*/
@@ -465,7 +451,20 @@ __bread(struct block_device *bdev, sector_t block, unsigned size)

bool block_dirty_folio(struct address_space *mapping, struct folio *folio);

-#else /* CONFIG_BLOCK */
+#ifdef CONFIG_BUFFER_HEAD
+
+void buffer_init(void);
+bool try_to_free_buffers(struct folio *folio);
+int inode_has_buffers(struct inode *inode);
+void invalidate_inode_buffers(struct inode *inode);
+int remove_inode_buffers(struct inode *inode);
+int sync_mapping_buffers(struct address_space *mapping);
+void invalidate_bh_lrus(void);
+void invalidate_bh_lrus_cpu(void);
+bool has_bh_in_lru(int cpu, void *dummy);
+extern int buffer_heads_over_limit;
+
+#else /* CONFIG_BUFFER_HEAD */

static inline void buffer_init(void) {}
static inline bool try_to_free_buffers(struct folio *folio) { return true; }
@@ -473,9 +472,10 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; }
static inline void invalidate_inode_buffers(struct inode *inode) {}
static inline int remove_inode_buffers(struct inode *inode) { return 1; }
static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
+static inline void invalidate_bh_lrus(void) {}
static inline void invalidate_bh_lrus_cpu(void) {}
static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; }
#define buffer_heads_over_limit 0

-#endif /* CONFIG_BLOCK */
+#endif /* CONFIG_BUFFER_HEAD */
#endif /* _LINUX_BUFFER_HEAD_H */
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119ae..54f50d34fd9d4f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -58,7 +58,11 @@ struct vm_fault;
#define IOMAP_F_DIRTY (1U << 1)
#define IOMAP_F_SHARED (1U << 2)
#define IOMAP_F_MERGED (1U << 3)
+#ifdef CONFIG_BUFFER_HEAD
#define IOMAP_F_BUFFER_HEAD (1U << 4)
+#else
+#define IOMAP_F_BUFFER_HEAD 0
+#endif /* CONFIG_BUFFER_HEAD */
#define IOMAP_F_XATTR (1U << 5)

/*
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 40e60c33cc6f3d..0e128ad5146015 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -12,6 +12,7 @@

#define RWBS_LEN 8

+#ifdef CONFIG_BUFFER_HEAD
DECLARE_EVENT_CLASS(block_buffer,

TP_PROTO(struct buffer_head *bh),
@@ -61,6 +62,7 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer,

TP_ARGS(bh)
);
+#endif /* CONFIG_BUFFER_HEAD */

/**
* block_rq_requeue - place block IO request back on a queue
diff --git a/mm/migrate.c b/mm/migrate.c
index 24baad2571e314..fe6f8d454aff83 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -684,7 +684,7 @@ int migrate_folio(struct address_space *mapping, struct folio *dst,
}
EXPORT_SYMBOL(migrate_folio);

-#ifdef CONFIG_BLOCK
+#ifdef CONFIG_BUFFER_HEAD
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
enum migrate_mode mode)
@@ -837,7 +837,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping,
return __buffer_migrate_folio(mapping, dst, src, mode, true);
}
EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
-#endif
+#endif /* CONFIG_BUFFER_HEAD */

int filemap_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
--
2.39.2


2023-08-01 19:43:14

by Hannes Reinecke

[permalink] [raw]
Subject: Re: [PATCH 3/6] block: open code __generic_file_write_iter for blkdev writes

On 8/1/23 19:21, Christoph Hellwig wrote:
> Open code __generic_file_write_iter to remove the indirect call into
> ->direct_IO and to prepare using the iomap based write code.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> block/fops.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 43 insertions(+), 2 deletions(-)
>
Reviewed-by: Hannes Reinecke <[email protected]>

Cheers,

Hannes



2023-08-01 20:02:06

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 5/6] block: use iomap for writes to block devices

Use iomap in buffer_head compat mode to write to block devices.

Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Luis Chamberlain <[email protected]>
Reviewed-by: Pankaj Raghav <[email protected]>
Reviewed-by: Hannes Reinecke <[email protected]>
---
block/Kconfig | 1 +
block/fops.c | 31 +++++++++++++++++++++++++++++--
2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index 86122e459fe046..1a13ef0b1ca10c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -5,6 +5,7 @@
menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
+ select FS_IOMAP
select SBITMAP
help
Provide block layer support for the kernel.
diff --git a/block/fops.c b/block/fops.c
index f0b822c28ddfe2..063ece37d44e44 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -15,6 +15,7 @@
#include <linux/falloc.h>
#include <linux/suspend.h>
#include <linux/fs.h>
+#include <linux/iomap.h>
#include <linux/module.h>
#include "blk.h"

@@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}

+static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+ struct block_device *bdev = I_BDEV(inode);
+ loff_t isize = i_size_read(inode);
+
+ iomap->bdev = bdev;
+ iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
+ if (iomap->offset >= isize)
+ return -EIO;
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = iomap->offset;
+ iomap->length = isize - iomap->offset;
+ iomap->flags |= IOMAP_F_BUFFER_HEAD;
+ return 0;
+}
+
+static const struct iomap_ops blkdev_iomap_ops = {
+ .iomap_begin = blkdev_iomap_begin,
+};
+
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, blkdev_get_block, wbc);
@@ -556,6 +578,11 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
return written;
}

+static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
+}
+
/*
* Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device.
@@ -605,9 +632,9 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = blkdev_direct_write(iocb, from);
if (ret >= 0 && iov_iter_count(from))
ret = direct_write_fallback(iocb, from, ret,
- generic_perform_write(iocb, from));
+ blkdev_buffered_write(iocb, from));
} else {
- ret = generic_perform_write(iocb, from);
+ ret = blkdev_buffered_write(iocb, from);
}

if (ret > 0)
--
2.39.2


2023-08-01 20:17:07

by Luis Chamberlain

[permalink] [raw]
Subject: Re: [PATCH 3/6] block: open code __generic_file_write_iter for blkdev writes

On Tue, Aug 01, 2023 at 07:21:58PM +0200, Christoph Hellwig wrote:
> Open code __generic_file_write_iter to remove the indirect call into
> ->direct_IO and to prepare using the iomap based write code.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Reviewed-by: Luis Chamberlain <[email protected]>

Luis

2023-08-02 07:50:29

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH 1/6] fs: remove emergency_thaw_bdev

On Tue, Aug 01, 2023 at 07:21:56PM +0200, Christoph Hellwig wrote:
> Fold emergency_thaw_bdev into it's only caller, to prepare for buffer.c
> to be built only when buffer_head support is enabled.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Reviewed-by: Luis Chamberlain <[email protected]>
> Reviewed-by: Hannes Reinecke <[email protected]>
> Reviewed-by: Johannes Thumshirn <[email protected]>
> ---

Reviewed-by: Christian Brauner <[email protected]>

2023-08-02 07:56:10

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH 5/6] block: use iomap for writes to block devices

On Tue, Aug 01, 2023 at 07:22:00PM +0200, Christoph Hellwig wrote:
> Use iomap in buffer_head compat mode to write to block devices.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Reviewed-by: Luis Chamberlain <[email protected]>
> Reviewed-by: Pankaj Raghav <[email protected]>
> Reviewed-by: Hannes Reinecke <[email protected]>
> ---

Reviewed-by: Christian Brauner <[email protected]>

2023-08-02 08:07:00

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH 3/6] block: open code __generic_file_write_iter for blkdev writes

On Tue, Aug 01, 2023 at 07:21:58PM +0200, Christoph Hellwig wrote:
> Open code __generic_file_write_iter to remove the indirect call into
> ->direct_IO and to prepare using the iomap based write code.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---

Reviewed-by: Christian Brauner <[email protected]>

2023-08-02 12:24:05

by Johannes Thumshirn

[permalink] [raw]
Subject: Re: [PATCH 6/6] fs: add CONFIG_BUFFER_HEAD

Looks good,
Reviewed-by: Johannes Thumshirn <[email protected]>

2023-08-02 14:10:44

by Johannes Thumshirn

[permalink] [raw]
Subject: Re: [PATCH 5/6] block: use iomap for writes to block devices

Looks good,
Reviewed-by: Johannes Thumshirn <[email protected]>

2023-08-02 16:50:58

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH 1/6] fs: remove emergency_thaw_bdev


On Tue, 01 Aug 2023 19:21:56 +0200, Christoph Hellwig wrote:
> Fold emergency_thaw_bdev into it's only caller, to prepare for buffer.c
> to be built only when buffer_head support is enabled.
>
>

Applied, thanks!

[1/6] fs: remove emergency_thaw_bdev
commit: 4a8b719f95c0dcd15fb7a04b806ad8139fa7c850
[2/6] fs: rename and move block_page_mkwrite_return
commit: 2ba39cc46bfe463cb9673bf62a04c4c21942f1f2
[3/6] block: open code __generic_file_write_iter for blkdev writes
commit: 727cfe976758b79f8d2f8051c75a5ccb14539a56
[4/6] block: stop setting ->direct_IO
commit: a05f7bd9578b17521a9a5f3689f3934c082c6390
[5/6] block: use iomap for writes to block devices
commit: 487c607df790d366e67a7d6a30adf785cdd98e55
[6/6] fs: add CONFIG_BUFFER_HEAD
commit: 925c86a19bacf8ce10eb666328fb3fa5aff7b951

Best regards,
--
Jens Axboe




2023-08-29 13:55:43

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 3/6] block: open code __generic_file_write_iter for blkdev writes

On Tue, Aug 29, 2023 at 03:06:14AM +0100, Al Viro wrote:
> On Tue, Aug 01, 2023 at 07:21:58PM +0200, Christoph Hellwig wrote:
> > @@ -569,7 +594,23 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > iov_iter_truncate(from, size);
> > }
> >
> > - ret = __generic_file_write_iter(iocb, from);
> > + ret = file_remove_privs(file);
> > + if (ret)
> > + return ret;
>
> That chunk is a bit of a WTF generator... Thankfully,
>
> static int __file_remove_privs(struct file *file, unsigned int flags)
> {
> struct dentry *dentry = file_dentry(file);
> struct inode *inode = file_inode(file);
> int error = 0;
> int kill;
>
> if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
> return 0;
>
> means that it's really a no-op. But I'd still suggest
> removing it, just to reduce the amount of head-scratching
> for people who'll be reading that code later...

I'll send an incremental patch to remove it once the changes hit
Linus' tree.

2024-04-26 10:38:58

by Xu Yang

[permalink] [raw]
Subject: Re: [PATCH 5/6] block: use iomap for writes to block devices

Hi Christoph,

On Tue, Aug 01, 2023 at 07:22:00PM +0200, Christoph Hellwig wrote:
> Use iomap in buffer_head compat mode to write to block devices.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Reviewed-by: Luis Chamberlain <[email protected]>
> Reviewed-by: Pankaj Raghav <[email protected]>
> Reviewed-by: Hannes Reinecke <[email protected]>
> ---
> block/Kconfig | 1 +
> block/fops.c | 31 +++++++++++++++++++++++++++++--
> 2 files changed, 30 insertions(+), 2 deletions(-)
>
> diff --git a/block/Kconfig b/block/Kconfig
> index 86122e459fe046..1a13ef0b1ca10c 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -5,6 +5,7 @@
> menuconfig BLOCK
> bool "Enable the block layer" if EXPERT
> default y
> + select FS_IOMAP
> select SBITMAP
> help
> Provide block layer support for the kernel.
> diff --git a/block/fops.c b/block/fops.c
> index f0b822c28ddfe2..063ece37d44e44 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -15,6 +15,7 @@
> #include <linux/falloc.h>
> #include <linux/suspend.h>
> #include <linux/fs.h>
> +#include <linux/iomap.h>
> #include <linux/module.h>
> #include "blk.h"
>
> @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
> }
>
> +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> + unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
> +{
> + struct block_device *bdev = I_BDEV(inode);
> + loff_t isize = i_size_read(inode);
> +
> + iomap->bdev = bdev;
> + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
> + if (iomap->offset >= isize)
> + return -EIO;
> + iomap->type = IOMAP_MAPPED;
> + iomap->addr = iomap->offset;
> + iomap->length = isize - iomap->offset;
> + iomap->flags |= IOMAP_F_BUFFER_HEAD;
> + return 0;
> +}
> +
> +static const struct iomap_ops blkdev_iomap_ops = {
> + .iomap_begin = blkdev_iomap_begin,
> +};
> +
> static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
> {
> return block_write_full_page(page, blkdev_get_block, wbc);
> @@ -556,6 +578,11 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
> return written;
> }
>
> +static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
> +}
> +
> /*
> * Write data to the block device. Only intended for the block device itself
> * and the raw driver which basically is a fake block device.
> @@ -605,9 +632,9 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
> ret = blkdev_direct_write(iocb, from);
> if (ret >= 0 && iov_iter_count(from))
> ret = direct_write_fallback(iocb, from, ret,
> - generic_perform_write(iocb, from));
> + blkdev_buffered_write(iocb, from));
> } else {
> - ret = generic_perform_write(iocb, from);
> + ret = blkdev_buffered_write(iocb, from);
> }
>
> if (ret > 0)

I have been testing SSD block device write performance recently. I found that
the write speed decreased greatly on my board (330MB/s -> 130MB/s). I then spent
some time looking for the cause, and finally found that it is caused by this
patch: if I revert this patch, the write speed recovers to 330MB/s.

I'm using the command below to test write performance:
dd if=/dev/zero of=/dev/sda bs=4M count=1024

I also ran more tests to gather more data. In short, I found that the write
speed changes with the "bs=" parameter.

I wrote a total of 4GB of data to sda for each test; the results are below:

- dd if=/dev/zero of=/dev/sda bs=400K count=10485 (334 MB/s)
- dd if=/dev/zero of=/dev/sda bs=800K count=5242 (278 MB/s)
- dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (204 MB/s)
- dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (170 MB/s)
- dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (150 MB/s)
- dd if=/dev/zero of=/dev/sda bs=4500K count=932 (139 MB/s)

With this patch reverted, I got the results below:

- dd if=/dev/zero of=/dev/sda bs=400K count=10485 (339 MB/s)
- dd if=/dev/zero of=/dev/sda bs=800K count=5242 (330 MB/s)
- dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (332 MB/s)
- dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (333 MB/s)
- dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (333 MB/s)
- dd if=/dev/zero of=/dev/sda bs=4500K count=932 (333 MB/s)

I just want to know whether these results are expected when using iomap, or
whether this is a real issue?

Many thanks in advance!

Best Regards,
Xu Yang

> --
> 2.39.2
>

2024-05-08 01:47:26

by Xu Yang

[permalink] [raw]
Subject: Re: [PATCH 5/6] block: use iomap for writes to block devices

On Fri, Apr 26, 2024 at 06:37:27PM +0800, Xu Yang wrote:
> Hi Christoph,
>
> On Tue, Aug 01, 2023 at 07:22:00PM +0200, Christoph Hellwig wrote:
> > Use iomap in buffer_head compat mode to write to block devices.
> >
> > Signed-off-by: Christoph Hellwig <[email protected]>
> > Reviewed-by: Luis Chamberlain <[email protected]>
> > Reviewed-by: Pankaj Raghav <[email protected]>
> > Reviewed-by: Hannes Reinecke <[email protected]>
> > ---
> > block/Kconfig | 1 +
> > block/fops.c | 31 +++++++++++++++++++++++++++++--
> > 2 files changed, 30 insertions(+), 2 deletions(-)
> >
> > diff --git a/block/Kconfig b/block/Kconfig
> > index 86122e459fe046..1a13ef0b1ca10c 100644
> > --- a/block/Kconfig
> > +++ b/block/Kconfig
> > @@ -5,6 +5,7 @@
> > menuconfig BLOCK
> > bool "Enable the block layer" if EXPERT
> > default y
> > + select FS_IOMAP
> > select SBITMAP
> > help
> > Provide block layer support for the kernel.
> > diff --git a/block/fops.c b/block/fops.c
> > index f0b822c28ddfe2..063ece37d44e44 100644
> > --- a/block/fops.c
> > +++ b/block/fops.c
> > @@ -15,6 +15,7 @@
> > #include <linux/falloc.h>
> > #include <linux/suspend.h>
> > #include <linux/fs.h>
> > +#include <linux/iomap.h>
> > #include <linux/module.h>
> > #include "blk.h"
> >
> > @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> > return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
> > }
> >
> > +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> > + unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
> > +{
> > + struct block_device *bdev = I_BDEV(inode);
> > + loff_t isize = i_size_read(inode);
> > +
> > + iomap->bdev = bdev;
> > + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
> > + if (iomap->offset >= isize)
> > + return -EIO;
> > + iomap->type = IOMAP_MAPPED;
> > + iomap->addr = iomap->offset;
> > + iomap->length = isize - iomap->offset;
> > + iomap->flags |= IOMAP_F_BUFFER_HEAD;
> > + return 0;
> > +}
> > +
> > +static const struct iomap_ops blkdev_iomap_ops = {
> > + .iomap_begin = blkdev_iomap_begin,
> > +};
> > +
> > static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
> > {
> > return block_write_full_page(page, blkdev_get_block, wbc);
> > @@ -556,6 +578,11 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
> > return written;
> > }
> >
> > +static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
> > +{
> > + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
> > +}
> > +
> > /*
> > * Write data to the block device. Only intended for the block device itself
> > * and the raw driver which basically is a fake block device.
> > @@ -605,9 +632,9 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > ret = blkdev_direct_write(iocb, from);
> > if (ret >= 0 && iov_iter_count(from))
> > ret = direct_write_fallback(iocb, from, ret,
> > - generic_perform_write(iocb, from));
> > + blkdev_buffered_write(iocb, from));
> > } else {
> > - ret = generic_perform_write(iocb, from);
> > + ret = blkdev_buffered_write(iocb, from);
> > }
> >
> > if (ret > 0)
>
> I'm testing SSD block device write performance recently. I found the write
> speed descrased greatly on my board (330MB/s -> 130MB/s). Then I spent some
> time to find cause, finally find that it's caused by this patch and if I
> revert this patch, write speed can recover to 330MB/s.
>
> I'm using below command to test write performance:
> dd if=/dev/zero of=/dev/sda bs=4M count=1024
>
> And I also do more tests to get more findings. In short, I found write
> speed changes with the "bs=" parameter.
>
> I totally write 4GB data to sda for each test, the results as below:
>
> - dd if=/dev/zero of=/dev/sda bs=400K count=10485 (334 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=800K count=5242 (278 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (204 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (170 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (150 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=4500K count=932 (139 MB/s)
>
> When this patch reverted, I got below results:
>
> - dd if=/dev/zero of=/dev/sda bs=400K count=10485 (339 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=800K count=5242 (330 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (332 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (333 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (333 MB/s)
> - dd if=/dev/zero of=/dev/sda bs=4500K count=932 (333 MB/s)
>
> I just want to know if this results is expected when uses iomap, or it's
> a real issue?
>
> Many thanks in advance!

A gentle ping.

>
> Best Regards,
> Xu Yang
>
> > --
> > 2.39.2
> >