Hi all,
after all the talk about removing buffer_heads, here is a series that
shows how to build a kernel without buffer_heads. And how unrealistic
it is to remove them entirely.
Most of the series refactors some common code to make implementing direct
I/O easier without use of the ->direct_IO method and the helpers based
around it. It then switches buffered writes (but not writeback) for
block devices to use iomap unconditionally, but still using buffer_heads.
The final patch then adds a CONFIG_BUFFER_HEAD selected by all file
systems that need it (which is most block based file systems), makes the
buffer_head support in iomap optional, and adds an alternative
implementation of the block device address_operations using iomap.
With this you can build a kernel with block device support, but without
buffer_heads. This kernel supports xfs and btrfs as full blown block
based filesystems, and a bunch of read-only ones like cramfs, erofs and
squashfs. Note that the md software raid driver is also disabled as it
has some (rather questionable) buffer_head usage in the unconditionally
built bitmap code.
The series is based on Linux 6.3 and will need some rebasing before it
can be fed to the maintainers incrementally. All but the last patch
definitely seem useful to me. The last one I think is just to avoid
introducing new buffer_head dependencies, even if I suspect the real
life usefulness of a !CONFIG_BUFFER_HEAD kernel might be rather limited.
All callers of generic_perform_write need to update ki_pos, so move that
update into common code.
Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/ceph/file.c | 2 --
fs/ext4/file.c | 9 +++------
fs/f2fs/file.c | 1 -
fs/nfs/file.c | 1 -
mm/filemap.c | 8 ++++----
5 files changed, 7 insertions(+), 14 deletions(-)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index f4d8bf7dec88a8..feeb9882ef635a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1894,8 +1894,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
* can not run at the same time
*/
written = generic_perform_write(iocb, from);
- if (likely(written >= 0))
- iocb->ki_pos = pos + written;
ceph_end_io_write(inode);
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0b8b4499e5ca18..1026acaf1235a0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -291,12 +291,9 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
out:
inode_unlock(inode);
- if (likely(ret > 0)) {
- iocb->ki_pos += ret;
- ret = generic_write_sync(iocb, ret);
- }
-
- return ret;
+ if (unlikely(ret <= 0))
+ return ret;
+ return generic_write_sync(iocb, ret);
}
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index f4ab23efcf85f8..5a9ae054b6da7d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4511,7 +4511,6 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb,
current->backing_dev_info = NULL;
if (ret > 0) {
- iocb->ki_pos += ret;
f2fs_update_iostat(F2FS_I_SB(inode), inode,
APP_BUFFERED_IO, ret);
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 893625eacab9fa..abdae2b29369be 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -666,7 +666,6 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
written = result;
- iocb->ki_pos += written;
nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
if (mntflags & NFS_MOUNT_WRITE_EAGER) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 2723104cc06a12..0110bde3708b3f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3960,7 +3960,10 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
- return written ? written : status;
+ if (!written)
+ return status;
+ iocb->ki_pos += written;
+ return written;
}
EXPORT_SYMBOL(generic_perform_write);
@@ -4039,7 +4042,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
endbyte = pos + status - 1;
err = filemap_write_and_wait_range(mapping, pos, endbyte);
if (err == 0) {
- iocb->ki_pos = endbyte + 1;
written += status;
invalidate_mapping_pages(mapping,
pos >> PAGE_SHIFT,
@@ -4052,8 +4054,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
} else {
written = generic_perform_write(iocb, from);
- if (likely(written > 0))
- iocb->ki_pos += written;
}
out:
current->backing_dev_info = NULL;
--
2.39.2
On Mon, Apr 24, 2023 at 8:22 AM Christoph Hellwig <[email protected]> wrote:
> All callers of generic_perform_write need to updated ki_pos, move it into
> common code.
We've actually got a similar situation with
iomap_file_buffered_write() and its callers. Would it make sense to
fix that up as well?
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/ceph/file.c | 2 --
> fs/ext4/file.c | 9 +++------
> fs/f2fs/file.c | 1 -
> fs/nfs/file.c | 1 -
> mm/filemap.c | 8 ++++----
> 5 files changed, 7 insertions(+), 14 deletions(-)
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index f4d8bf7dec88a8..feeb9882ef635a 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -1894,8 +1894,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> * can not run at the same time
> */
> written = generic_perform_write(iocb, from);
> - if (likely(written >= 0))
> - iocb->ki_pos = pos + written;
> ceph_end_io_write(inode);
> }
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 0b8b4499e5ca18..1026acaf1235a0 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -291,12 +291,9 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>
> out:
> inode_unlock(inode);
> - if (likely(ret > 0)) {
> - iocb->ki_pos += ret;
> - ret = generic_write_sync(iocb, ret);
> - }
> -
> - return ret;
> + if (unlikely(ret <= 0))
> + return ret;
> + return generic_write_sync(iocb, ret);
> }
>
> static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index f4ab23efcf85f8..5a9ae054b6da7d 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -4511,7 +4511,6 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb,
> current->backing_dev_info = NULL;
>
> if (ret > 0) {
> - iocb->ki_pos += ret;
> f2fs_update_iostat(F2FS_I_SB(inode), inode,
> APP_BUFFERED_IO, ret);
> }
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index 893625eacab9fa..abdae2b29369be 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -666,7 +666,6 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
> goto out;
>
> written = result;
> - iocb->ki_pos += written;
> nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
>
> if (mntflags & NFS_MOUNT_WRITE_EAGER) {
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 2723104cc06a12..0110bde3708b3f 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3960,7 +3960,10 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
> balance_dirty_pages_ratelimited(mapping);
> } while (iov_iter_count(i));
>
> - return written ? written : status;
> + if (!written)
> + return status;
> + iocb->ki_pos += written;
Could be turned into:
iocb->ki_pos = pos;
> + return written;
> }
> EXPORT_SYMBOL(generic_perform_write);
>
> @@ -4039,7 +4042,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
> endbyte = pos + status - 1;
> err = filemap_write_and_wait_range(mapping, pos, endbyte);
> if (err == 0) {
> - iocb->ki_pos = endbyte + 1;
> written += status;
> invalidate_mapping_pages(mapping,
> pos >> PAGE_SHIFT,
> @@ -4052,8 +4054,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
> }
> } else {
> written = generic_perform_write(iocb, from);
> - if (likely(written > 0))
> - iocb->ki_pos += written;
> }
> out:
> current->backing_dev_info = NULL;
> --
> 2.39.2
>
Thanks,
Andreas