From: Andreas Hindborg <[email protected]>
Hi All!
Kernel robot found a few issues with the first iteration of this patch [1]. I
also rebased the patch on the Rust PR for 6.10 [2], because we have some changes
to allocation going in, and this patch needs updates for those changes.
This is a resend to correct those issues.
Changes from v1:
- Fix paths in doc comments so they are correct and `rustdoc` does not complain.
- Fix a typo regarding stabilization of `const_refs_to_static`.
- Properly gate `to_blk_status` behind `CONFIG_BLOCK`.
- Update doc comment, a sector is usually 4096 bytes, not 512.
- Update doc comment, use consistent unit names.
- Rebase on `rust-6.10`.
I did not change the interface to use bytes rather than sectors, even though I
like the idea. I think it is preferable to have some similarity to the C API for
now.
Best regards,
Andreas Hindborg
Link: https://lore.kernel.org/all/[email protected]/ [1]
Link: https://lore.kernel.org/all/[email protected]/ [2]
Andreas Hindborg (3):
rust: block: introduce `kernel::block::mq` module
rust: block: add rnull, Rust null_blk implementation
MAINTAINERS: add entry for Rust block device driver API
MAINTAINERS | 14 ++
drivers/block/Kconfig | 9 ++
drivers/block/Makefile | 3 +
drivers/block/rnull.rs | 86 ++++++++++
rust/bindings/bindings_helper.h | 2 +
rust/helpers.c | 16 ++
rust/kernel/block.rs | 5 +
rust/kernel/block/mq.rs | 99 ++++++++++++
rust/kernel/block/mq/gen_disk.rs | 206 ++++++++++++++++++++++++
rust/kernel/block/mq/operations.rs | 245 +++++++++++++++++++++++++++++
rust/kernel/block/mq/raw_writer.rs | 55 +++++++
rust/kernel/block/mq/request.rs | 227 ++++++++++++++++++++++++++
rust/kernel/block/mq/tag_set.rs | 93 +++++++++++
rust/kernel/error.rs | 6 +
rust/kernel/lib.rs | 2 +
15 files changed, 1068 insertions(+)
create mode 100644 drivers/block/rnull.rs
create mode 100644 rust/kernel/block.rs
create mode 100644 rust/kernel/block/mq.rs
create mode 100644 rust/kernel/block/mq/gen_disk.rs
create mode 100644 rust/kernel/block/mq/operations.rs
create mode 100644 rust/kernel/block/mq/raw_writer.rs
create mode 100644 rust/kernel/block/mq/request.rs
create mode 100644 rust/kernel/block/mq/tag_set.rs
base-commit: 97ab3e8eec0ce79d9e265e6c9e4c480492180409
--
2.44.0
From: Andreas Hindborg <[email protected]>
Add initial abstractions for working with blk-mq.
This patch is a maintained, refactored subset of code originally published
by Wedson Almeida Filho <[email protected]> [1].
[1] https://github.com/wedsonaf/linux/tree/f2cfd2fe0e2ca4e90994f96afe268bbd4382a891/rust/kernel/blk/mq.rs
Cc: Wedson Almeida Filho <[email protected]>
Signed-off-by: Andreas Hindborg <[email protected]>
---
rust/bindings/bindings_helper.h | 2 +
rust/helpers.c | 16 ++
rust/kernel/block.rs | 5 +
rust/kernel/block/mq.rs | 97 ++++++++++++
rust/kernel/block/mq/gen_disk.rs | 206 ++++++++++++++++++++++++
rust/kernel/block/mq/operations.rs | 245 +++++++++++++++++++++++++++++
rust/kernel/block/mq/raw_writer.rs | 55 +++++++
rust/kernel/block/mq/request.rs | 227 ++++++++++++++++++++++++++
rust/kernel/block/mq/tag_set.rs | 93 +++++++++++
rust/kernel/error.rs | 6 +
rust/kernel/lib.rs | 2 +
11 files changed, 954 insertions(+)
create mode 100644 rust/kernel/block.rs
create mode 100644 rust/kernel/block/mq.rs
create mode 100644 rust/kernel/block/mq/gen_disk.rs
create mode 100644 rust/kernel/block/mq/operations.rs
create mode 100644 rust/kernel/block/mq/raw_writer.rs
create mode 100644 rust/kernel/block/mq/request.rs
create mode 100644 rust/kernel/block/mq/tag_set.rs
diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
index ddb5644d4fd9..68f937f8374f 100644
--- a/rust/bindings/bindings_helper.h
+++ b/rust/bindings/bindings_helper.h
@@ -7,6 +7,8 @@
*/
#include <kunit/test.h>
+#include <linux/blk_types.h>
+#include <linux/blk-mq.h>
#include <linux/errname.h>
#include <linux/ethtool.h>
#include <linux/jiffies.h>
diff --git a/rust/helpers.c b/rust/helpers.c
index 4c8b7b92a4f4..3ba108095346 100644
--- a/rust/helpers.c
+++ b/rust/helpers.c
@@ -178,3 +178,19 @@ static_assert(
__alignof__(size_t) == __alignof__(uintptr_t),
"Rust code expects C `size_t` to match Rust `usize`"
);
+
+// This will soon be moved to a separate file, so no need to merge with above.
+#include <linux/blk-mq.h>
+#include <linux/blkdev.h>
+
+void *rust_helper_blk_mq_rq_to_pdu(struct request *rq)
+{
+ return blk_mq_rq_to_pdu(rq);
+}
+EXPORT_SYMBOL_GPL(rust_helper_blk_mq_rq_to_pdu);
+
+struct request *rust_helper_blk_mq_rq_from_pdu(void *pdu)
+{
+ return blk_mq_rq_from_pdu(pdu);
+}
+EXPORT_SYMBOL_GPL(rust_helper_blk_mq_rq_from_pdu);
diff --git a/rust/kernel/block.rs b/rust/kernel/block.rs
new file mode 100644
index 000000000000..150f710efe5b
--- /dev/null
+++ b/rust/kernel/block.rs
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Types for working with the block layer.
+
+pub mod mq;
diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs
new file mode 100644
index 000000000000..efbd2588791b
--- /dev/null
+++ b/rust/kernel/block/mq.rs
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This module provides types for implementing block drivers that interface the
+//! blk-mq subsystem.
+//!
+//! To implement a block device driver, a Rust module must do the following:
+//!
+//! - Implement [`Operations`] for a type `T`
+//! - Create a [`TagSet<T>`]
+//! - Create a [`gen_disk::GenDisk<T>`], passing in the `TagSet` reference
+//! - Add the disk to the system by calling [`gen_disk::GenDisk::add`]
+//!
+//! The types available in this module that have direct C counterparts are:
+//!
+//! - The `TagSet` type that abstracts the C type `struct tag_set`.
+//! - The `GenDisk` type that abstracts the C type `struct gendisk`.
+//! - The `Request` type that abstracts the C type `struct request`.
+//!
+//! The kernel will interface with the block device driver by calling the method
+//! implementations of the `Operations` trait.
+//!
+//! IO requests are passed to the driver as [`kernel::types::ARef<Request>`]
+//! instances. The `Request` type is a wrapper around the C `struct request`.
+//! The driver must mark end of processing by calling one of the
+//! `Request::end`, methods. Failure to do so can lead to deadlock or timeout
+//! errors. Please note that the C function `blk_mq_start_request` is implicitly
+//! called when the request is queued with the driver.
+//!
+//! The `TagSet` is responsible for creating and maintaining a mapping between
+//! `Request`s and integer ids as well as carrying a pointer to the vtable
+//! generated by `Operations`. This mapping is useful for associating
+//! completions from hardware with the correct `Request` instance. The `TagSet`
+//! determines the maximum queue depth by setting the number of `Request`
+//! instances available to the driver, and it determines the number of queues to
+//! instantiate for the driver. If possible, a driver should allocate one queue
+//! per core, to keep queue data local to a core.
+//!
+//! One `TagSet` instance can be shared between multiple `GenDisk` instances.
+//! This can be useful when implementing drivers where one piece of hardware
+//! with one set of IO resources are represented to the user as multiple disks.
+//!
+//! One significant difference between block device drivers implemented with
+//! these Rust abstractions and drivers implemented in C, is that the Rust
+//! drivers have to own a reference count on the `Request` type when the IO is
+//! in flight. This is to ensure that the C `struct request` instances backing
+//! the Rust `Request` instances are live while the Rust driver holds a
+//! reference to the `Request`. In addition, the conversion of an integer tag to
+//! a `Request` via the `TagSet` would not be sound without this bookkeeping.
+//!
+//! # Example
+//!
+//! ```rust
+//! use kernel::{
+//! block::mq::*,
+//! new_mutex,
+//! prelude::*,
+//! sync::{Arc, Mutex},
+//! types::{ARef, ForeignOwnable},
+//! };
+//!
+//! struct MyBlkDevice;
+//!
+//! #[vtable]
+//! impl Operations for MyBlkDevice {
+//!
+//! fn queue_rq(rq: ARef<Request<Self>>, _is_last: bool) -> Result {
+//! Request::end_ok(rq);
+//! Ok(())
+//! }
+//!
+//! fn commit_rqs(
+//! ) {
+//! }
+//!
+//! fn complete(rq: ARef<Request<Self>>) {
+//! Request::end_ok(rq);
+//! }
+//! }
+//!
+//! let tagset: Arc<TagSet<MyBlkDevice>> = Arc::pin_init(TagSet::try_new(1, 256, 1))?;
+//! let mut disk = gen_disk::try_new(tagset)?;
+//! disk.set_name(format_args!("myblk"))?;
+//! disk.set_capacity_sectors(4096);
+//! disk.add()?;
+//!
+//! # Ok::<(), kernel::error::Error>(())
+//! ```
+
+pub mod gen_disk;
+mod operations;
+mod raw_writer;
+mod request;
+mod tag_set;
+
+pub use operations::Operations;
+pub use request::Request;
+pub use tag_set::TagSet;
diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs
new file mode 100644
index 000000000000..a88802c1f918
--- /dev/null
+++ b/rust/kernel/block/mq/gen_disk.rs
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Generic disk abstraction.
+//!
+//! C header: [`include/linux/blkdev.h`](srctree/include/linux/blkdev.h)
+//! C header: [`include/linux/blk_mq.h`](srctree/include/linux/blk_mq.h)
+
+use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet};
+use crate::{bindings, error::from_err_ptr, error::Result, sync::Arc};
+use core::fmt::{self, Write};
+use core::marker::PhantomData;
+
+/// A generic block device.
+///
+/// # Invariants
+///
+/// - `gendisk` must always point to an initialized and valid `struct gendisk`.
+pub struct GenDisk<T: Operations, S: GenDiskState> {
+ _tagset: Arc<TagSet<T>>,
+ gendisk: *mut bindings::gendisk,
+ _phantom: core::marker::PhantomData<S>,
+}
+
+// SAFETY: `GenDisk` is an owned pointer to a `struct gendisk` and an `Arc` to a
+// `TagSet` It is safe to send this to other threads as long as T is Send.
+unsafe impl<T: Operations + Send, S: GenDiskState> Send for GenDisk<T, S> {}
+
+/// Disks in this state are allocated and initialized, but are not yet
+/// accessible from the kernel VFS.
+pub enum Initialized {}
+
+/// Disks in this state have been attached to the kernel VFS and may receive IO
+/// requests.
+pub enum Added {}
+
+/// Typestate representing states of a `GenDisk`.
+pub trait GenDiskState {}
+
+impl GenDiskState for Initialized {}
+impl GenDiskState for Added {}
+
+impl<T: Operations> GenDisk<T, Initialized> {
+ /// Register the device with the kernel. When this function returns, the
+ /// device is accessible from VFS. The kernel may issue reads to the device
+ /// during registration to discover partition information.
+ pub fn add(self) -> Result<GenDisk<T, Added>> {
+ crate::error::to_result(
+ // SAFETY: By type invariant, `self.gendisk` points to a valid and
+ // initialized instance of `struct gendisk`
+ unsafe {
+ bindings::device_add_disk(
+ core::ptr::null_mut(),
+ self.gendisk,
+ core::ptr::null_mut(),
+ )
+ },
+ )?;
+
+ // We don't want to run the destuctor and remove the device from the VFS
+ // when `disk` is dropped.
+ let mut old = core::mem::ManuallyDrop::new(self);
+
+ let new = GenDisk {
+ _tagset: old._tagset.clone(),
+ gendisk: old.gendisk,
+ _phantom: PhantomData,
+ };
+
+ // But we have to drop the `Arc` or it will leak.
+ // SAFETY: `old._tagset` is valid for write, aligned, non-null, and we
+ // have exclusive access. We are not accessing the value again after it
+ // is dropped.
+ unsafe { core::ptr::drop_in_place(&mut old._tagset) };
+
+ Ok(new)
+ }
+
+ /// Set the name of the device.
+ pub fn set_name(&mut self, args: fmt::Arguments<'_>) -> Result {
+ let mut raw_writer = RawWriter::from_array(
+ // SAFETY: By type invariant `self.gendisk` points to a valid and
+ // initialized instance. We have exclusive access, since the disk is
+ // not added to the VFS yet.
+ unsafe { &mut (*self.gendisk).disk_name },
+ )?;
+ raw_writer.write_fmt(args)?;
+ raw_writer.write_char('\0')?;
+ Ok(())
+ }
+
+ /// Set the logical block size of the device.
+ ///
+ /// This is the smallest unit the storage device can address. It is
+ /// typically 4096 bytes.
+ pub fn set_queue_logical_block_size(&mut self, size: u32) {
+ // SAFETY: By type invariant, `self.gendisk` points to a valid and
+ // initialized instance of `struct gendisk`.
+ unsafe { bindings::blk_queue_logical_block_size((*self.gendisk).queue, size) };
+ }
+
+ /// Set the physical block size of the device.
+ ///
+ /// This is the smallest unit a physical storage device can write
+ /// atomically. It is usually the same as the logical block size but may be
+ /// bigger. One example is SATA drives with 4096 byte physical block size
+ /// that expose a 512 byte logical block size to the operating system.
+ pub fn set_queue_physical_block_size(&mut self, size: u32) {
+ // SAFETY: By type invariant, `self.gendisk` points to a valid and
+ // initialized instance of `struct gendisk`.
+ unsafe { bindings::blk_queue_physical_block_size((*self.gendisk).queue, size) };
+ }
+}
+
+impl<T: Operations, S: GenDiskState> GenDisk<T, S> {
+ /// Call to tell the block layer the capacity of the device in sectors (512
+ /// bytes).
+ pub fn set_capacity_sectors(&self, sectors: u64) {
+ // SAFETY: By type invariant, `self.gendisk` points to a valid and
+ // initialized instance of `struct gendisk`. Callee takes a lock to
+ // synchronize this operation, so we will not race.
+ unsafe { bindings::set_capacity(self.gendisk, sectors) };
+ }
+
+ /// Set the rotational media attribute for the device.
+ pub fn set_rotational(&self, rotational: bool) {
+ if !rotational {
+ // SAFETY: By type invariant, `self.gendisk` points to a valid and
+ // initialized instance of `struct gendisk`. This operation uses a
+ // relaxed atomic bit flip operation, so there is no race on this
+ // field.
+ unsafe {
+ bindings::blk_queue_flag_set(bindings::QUEUE_FLAG_NONROT, (*self.gendisk).queue)
+ };
+ } else {
+ // SAFETY: By type invariant, `self.gendisk` points to a valid and
+ // initialized instance of `struct gendisk`. This operation uses a
+ // relaxed atomic bit flip operation, so there is no race on this
+ // field.
+ unsafe {
+ bindings::blk_queue_flag_clear(bindings::QUEUE_FLAG_NONROT, (*self.gendisk).queue)
+ };
+ }
+ }
+}
+
+impl<T: Operations, S: GenDiskState> Drop for GenDisk<T, S> {
+ fn drop(&mut self) {
+ // TODO: This will `WARN` if the disk was not added. Since we cannot
+ // specialize drop, we have to call it, or track state with a flag.
+
+ // SAFETY: By type invariant, `self.gendisk` points to a valid and
+ // initialized instance of `struct gendisk`
+ unsafe { bindings::del_gendisk(self.gendisk) };
+ }
+}
+
+/// Try to create a new `GenDisk`.
+pub fn try_new<T: Operations>(tagset: Arc<TagSet<T>>) -> Result<GenDisk<T, Initialized>> {
+ let lock_class_key = crate::sync::LockClassKey::new();
+
+ // SAFETY: `tagset.raw_tag_set()` points to a valid and initialized tag set
+ let gendisk = from_err_ptr(unsafe {
+ bindings::__blk_mq_alloc_disk(
+ tagset.raw_tag_set(),
+ core::ptr::null_mut(), // TODO: We can pass queue limits right here
+ core::ptr::null_mut(),
+ lock_class_key.as_ptr(),
+ )
+ })?;
+
+ const TABLE: bindings::block_device_operations = bindings::block_device_operations {
+ submit_bio: None,
+ open: None,
+ release: None,
+ ioctl: None,
+ compat_ioctl: None,
+ check_events: None,
+ unlock_native_capacity: None,
+ getgeo: None,
+ set_read_only: None,
+ swap_slot_free_notify: None,
+ report_zones: None,
+ devnode: None,
+ alternative_gpt_sector: None,
+ get_unique_id: None,
+ // TODO: Set to THIS_MODULE. Waiting for const_refs_to_static feature to
+ // be merged (unstable in rustc 1.78 which is staged for linux 6.10)
+ // https://github.com/rust-lang/rust/issues/119618
+ owner: core::ptr::null_mut(),
+ pr_ops: core::ptr::null_mut(),
+ free_disk: None,
+ poll_bio: None,
+ };
+
+ // SAFETY: gendisk is a valid pointer as we initialized it above
+ unsafe { (*gendisk).fops = &TABLE };
+
+ // INVARIANT: `gendisk` was initialized above.
+ // INVARIANT: `gendisk.queue.queue_data` is set to `data` in the call to
+ // `__blk_mq_alloc_disk` above.
+ Ok(GenDisk {
+ _tagset: tagset,
+ gendisk,
+ _phantom: PhantomData,
+ })
+}
diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs
new file mode 100644
index 000000000000..3bd1af2c2260
--- /dev/null
+++ b/rust/kernel/block/mq/operations.rs
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This module provides an interface for blk-mq drivers to implement.
+//!
+//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h)
+
+use crate::{
+ bindings,
+ block::mq::request::RequestDataWrapper,
+ block::mq::Request,
+ error::{from_result, Result},
+ types::ARef,
+};
+use core::{marker::PhantomData, sync::atomic::AtomicU64, sync::atomic::Ordering};
+
+/// Implement this trait to interface blk-mq as block devices.
+///
+/// To implement a block device driver, implement this trait as described in the
+/// [module level documentation]. The kernel will use the implementation of the
+/// functions defined in this trait to interface a block device driver. Note:
+/// There is no need for an exit_request() implementation, because the `drop`
+/// implementation of the [`Request`] type will be invoked by automatically by
+/// the C/Rust glue logic.
+///
+/// [module level documentation]: kernel::block::mq
+#[macros::vtable]
+pub trait Operations: Sized {
+ /// Called by the kernel to queue a request with the driver. If `is_last` is
+ /// `false`, the driver is allowed to defer committing the request.
+ fn queue_rq(rq: ARef<Request<Self>>, is_last: bool) -> Result;
+
+ /// Called by the kernel to indicate that queued requests should be submitted.
+ fn commit_rqs();
+
+ /// Called by the kernel when the request is completed.
+ fn complete(_rq: ARef<Request<Self>>);
+
+ /// Called by the kernel to poll the device for completed requests. Only
+ /// used for poll queues.
+ fn poll() -> bool {
+ crate::build_error(crate::error::VTABLE_DEFAULT_ERROR)
+ }
+}
+
+/// A vtable for blk-mq to interact with a block device driver.
+///
+/// A `bindings::blk_mq_opa` vtable is constructed from pointers to the `extern
+/// "C"` functions of this struct, exposed through the `OperationsVTable::VTABLE`.
+///
+/// For general documentation of these methods, see the kernel source
+/// documentation related to `struct blk_mq_operations` in
+/// [`include/linux/blk-mq.h`].
+///
+/// [`include/linux/blk-mq.h`]: srctree/include/linux/blk-mq.h
+pub(crate) struct OperationsVTable<T: Operations>(PhantomData<T>);
+
+impl<T: Operations> OperationsVTable<T> {
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// - The caller of this function must ensure `bd` is valid
+ /// and initialized. The pointees must outlive this function.
+ /// - This function must not be called with a `hctx` for which
+ /// `Self::exit_hctx_callback()` has been called.
+ /// - (*bd).rq must point to a valid `bindings:request` for which
+ /// `OperationsVTable<T>::init_request_callback` was called
+ unsafe extern "C" fn queue_rq_callback(
+ _hctx: *mut bindings::blk_mq_hw_ctx,
+ bd: *const bindings::blk_mq_queue_data,
+ ) -> bindings::blk_status_t {
+ // SAFETY: `bd.rq` is valid as required by the safety requirement for
+ // this function.
+ let request = unsafe { &*(*bd).rq.cast::<Request<T>>() };
+
+ // One refcount for the ARef, one for being in flight
+ request.wrapper_ref().refcount().store(2, Ordering::Relaxed);
+
+ let rq =
+ // SAFETY: We own a refcount that we took above. We pass that to `ARef`.
+ // By the safety requirements of this function, `request` is a valid
+ // `struct request` and the private data is properly initialized.
+ unsafe {Request::aref_from_raw((*bd).rq)};
+
+ // SAFETY: We have exclusive access and we just set the refcount above.
+ unsafe { Request::start_unchecked(&rq) };
+
+ let ret = T::queue_rq(
+ rq,
+ // SAFETY: `bd` is valid as required by the safety requirement for this function.
+ unsafe { (*bd).last },
+ );
+
+ if let Err(e) = ret {
+ e.to_blk_status()
+ } else {
+ bindings::BLK_STS_OK as _
+ }
+ }
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// This function may only be called by blk-mq C infrastructure.
+ unsafe extern "C" fn commit_rqs_callback(_hctx: *mut bindings::blk_mq_hw_ctx) {
+ T::commit_rqs()
+ }
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// This function may only be called by blk-mq C infrastructure. `rq` must
+ /// point to a valid request that has been marked as completed. The pointee
+ /// of `rq` must be valid for write for the duration of this function.
+ unsafe extern "C" fn complete_callback(rq: *mut bindings::request) {
+ // SAFETY: This function can only be dispatched through
+ // `Request::complete`. We leaked a refcount then which we pick back up
+ // now.
+ let aref = unsafe { Request::aref_from_raw(rq) };
+ T::complete(aref);
+ }
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// This function may only be called by blk-mq C infrastructure.
+ unsafe extern "C" fn poll_callback(
+ _hctx: *mut bindings::blk_mq_hw_ctx,
+ _iob: *mut bindings::io_comp_batch,
+ ) -> core::ffi::c_int {
+ T::poll().into()
+ }
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// This function may only be called by blk-mq C infrastructure. This
+ /// function may only be called onece before `exit_hctx_callback` is called
+ /// for the same context.
+ unsafe extern "C" fn init_hctx_callback(
+ _hctx: *mut bindings::blk_mq_hw_ctx,
+ _tagset_data: *mut core::ffi::c_void,
+ _hctx_idx: core::ffi::c_uint,
+ ) -> core::ffi::c_int {
+ from_result(|| Ok(0))
+ }
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// This function may only be called by blk-mq C infrastructure.
+ unsafe extern "C" fn exit_hctx_callback(
+ _hctx: *mut bindings::blk_mq_hw_ctx,
+ _hctx_idx: core::ffi::c_uint,
+ ) {
+ }
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// This function may only be called by blk-mq C infrastructure. `set` must
+ /// point to an initialized `TagSet<T>`.
+ unsafe extern "C" fn init_request_callback(
+ _set: *mut bindings::blk_mq_tag_set,
+ rq: *mut bindings::request,
+ _hctx_idx: core::ffi::c_uint,
+ _numa_node: core::ffi::c_uint,
+ ) -> core::ffi::c_int {
+ from_result(|| {
+ // SAFETY: The `blk_mq_tag_set` invariants guarantee that all
+ // requests are allocated with extra memory for the request data.
+ let pdu = unsafe { bindings::blk_mq_rq_to_pdu(rq) }.cast::<RequestDataWrapper>();
+
+ // SAFETY: The refcount field is allocated but not initialized, this
+ // valid for write.
+ unsafe { RequestDataWrapper::refcount_ptr(pdu).write(AtomicU64::new(0)) };
+
+ Ok(0)
+ })
+ }
+
+ /// This function is called by the C kernel. A pointer to this function is
+ /// installed in the `blk_mq_ops` vtable for the driver.
+ ///
+ /// # Safety
+ ///
+ /// This function may only be called by blk-mq C infrastructure. `rq` must
+ /// point to a request that was initialized by a call to
+ /// `Self::init_request_callback`.
+ unsafe extern "C" fn exit_request_callback(
+ _set: *mut bindings::blk_mq_tag_set,
+ rq: *mut bindings::request,
+ _hctx_idx: core::ffi::c_uint,
+ ) {
+ // SAFETY: The tagset invariants guarantee that all requests are allocated with extra memory
+ // for the request data.
+ let pdu = unsafe { bindings::blk_mq_rq_to_pdu(rq) }.cast::<RequestDataWrapper>();
+
+ // SAFETY: `pdu` is valid for read and write and is properly initialised.
+ unsafe { core::ptr::drop_in_place(pdu) };
+ }
+
+ const VTABLE: bindings::blk_mq_ops = bindings::blk_mq_ops {
+ queue_rq: Some(Self::queue_rq_callback),
+ queue_rqs: None,
+ commit_rqs: Some(Self::commit_rqs_callback),
+ get_budget: None,
+ put_budget: None,
+ set_rq_budget_token: None,
+ get_rq_budget_token: None,
+ timeout: None,
+ poll: if T::HAS_POLL {
+ Some(Self::poll_callback)
+ } else {
+ None
+ },
+ complete: Some(Self::complete_callback),
+ init_hctx: Some(Self::init_hctx_callback),
+ exit_hctx: Some(Self::exit_hctx_callback),
+ init_request: Some(Self::init_request_callback),
+ exit_request: Some(Self::exit_request_callback),
+ cleanup_rq: None,
+ busy: None,
+ map_queues: None,
+ #[cfg(CONFIG_BLK_DEBUG_FS)]
+ show_rq: None,
+ };
+
+ pub(crate) const fn build() -> &'static bindings::blk_mq_ops {
+ &Self::VTABLE
+ }
+}
diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs
new file mode 100644
index 000000000000..4f7e4692b592
--- /dev/null
+++ b/rust/kernel/block/mq/raw_writer.rs
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+
+use core::fmt::{self, Write};
+
+use crate::error::Result;
+use crate::prelude::EINVAL;
+
+/// A mutable reference to a byte buffer where a string can be written into.
+///
+/// # Invariants
+///
+/// `buffer` is always null terminated.
+pub(crate) struct RawWriter<'a> {
+ buffer: &'a mut [u8],
+ pos: usize,
+}
+
+impl<'a> RawWriter<'a> {
+ /// Create a new `RawWriter` instance.
+ fn new(buffer: &'a mut [u8]) -> Result<RawWriter<'a>> {
+ *(buffer.last_mut().ok_or(EINVAL)?) = 0;
+
+ // INVARIANT: We null terminated the buffer above
+ Ok(Self { buffer, pos: 0 })
+ }
+
+ pub(crate) fn from_array<const N: usize>(
+ a: &'a mut [core::ffi::c_char; N],
+ ) -> Result<RawWriter<'a>> {
+ Self::new(
+ // SAFETY: the buffer of `a` is valid for read and write as `u8` for
+ // at least `N` bytes.
+ unsafe { core::slice::from_raw_parts_mut(a.as_mut_ptr().cast::<u8>(), N) },
+ )
+ }
+}
+
+impl Write for RawWriter<'_> {
+ fn write_str(&mut self, s: &str) -> fmt::Result {
+ let bytes = s.as_bytes();
+ let len = bytes.len();
+
+ // We do not want to overwrite our null terminator
+ if self.pos + len > self.buffer.len() - 1 {
+ return Err(fmt::Error);
+ }
+
+ // INVARIANT: We are not overwriting the last byte
+ self.buffer[self.pos..self.pos + len].copy_from_slice(bytes);
+
+ self.pos += len;
+
+ Ok(())
+ }
+}
diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs
new file mode 100644
index 000000000000..db5d760615d7
--- /dev/null
+++ b/rust/kernel/block/mq/request.rs
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This module provides a wrapper for the C `struct request` type.
+//!
+//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h)
+
+use crate::{
+ bindings,
+ block::mq::Operations,
+ error::Result,
+ types::{ARef, AlwaysRefCounted, Opaque},
+};
+use core::{
+ marker::PhantomData,
+ ptr::{addr_of_mut, NonNull},
+ sync::atomic::{AtomicU64, Ordering},
+};
+
+/// A wrapper around a blk-mq `struct request`. This represents an IO request.
+///
+/// # Invariants
+///
+/// * `self.0` is a valid `struct request` created by the C portion of the kernel.
+/// * The private data area associated with this request must be an initialized
+/// and valid `RequestDataWrapper<T>`.
+/// * `self` is reference counted by atomic modification of
+/// self.wrapper_ref().refcount().
+///
+#[repr(transparent)]
+pub struct Request<T: Operations>(Opaque<bindings::request>, PhantomData<T>);
+
+impl<T: Operations> Request<T> {
+ /// Create an `ARef<Request>` from a `struct request` pointer.
+ ///
+ /// # Safety
+ ///
+ /// * The caller must own a refcount on `ptr` that is transferred to the
+ /// returned `ARef`.
+ /// * The type invariants for `Request` must hold for the pointee of `ptr`.
+ pub(crate) unsafe fn aref_from_raw(ptr: *mut bindings::request) -> ARef<Self> {
+ // INVARIANTS: By the safety requirements of this function, invariants are upheld.
+ // SAFETY: By the safety requirement of this function, we own a
+ // reference count that we can pass to `ARef`.
+ unsafe { ARef::from_raw(NonNull::new_unchecked(ptr as *const Self as *mut Self)) }
+ }
+
+ /// Notify the block layer that a request is going to be processed now.
+ ///
+ /// The block layer uses this hook to do proper initializations such as
+ /// starting the timeout timer. It is a requirement that block device
+ /// drivers call this function when starting to process a request.
+ ///
+ /// # Safety
+ ///
+ /// The caller must have exclusive ownership of `self`, that is
+ /// `self.wrapper_ref().refcount() == 2`.
+ pub(crate) unsafe fn start_unchecked(this: &ARef<Self>) {
+ // SAFETY: By type invariant, `self.0` is a valid `struct request`. By
+ // existence of `&mut self` we have exclusive access.
+ unsafe { bindings::blk_mq_start_request(this.0.get()) };
+ }
+
+ fn try_set_end(this: ARef<Self>) -> Result<ARef<Self>, ARef<Self>> {
+ // We can race with `TagSet::tag_to_rq`
+ match this.wrapper_ref().refcount().compare_exchange(
+ 2,
+ 0,
+ Ordering::Relaxed,
+ Ordering::Relaxed,
+ ) {
+ Err(_old) => Err(this),
+ Ok(_) => Ok(this),
+ }
+ }
+
+ /// Notify the block layer that the request has been completed without errors.
+ ///
+ /// This function will return `Err` if `this` is not the only `ARef`
+ /// referencing the request.
+ pub fn end_ok(this: ARef<Self>) -> Result<(), ARef<Self>> {
+ let this = Self::try_set_end(this)?;
+ let request_ptr = this.0.get();
+ core::mem::forget(this);
+
+ // SAFETY: By type invariant, `self.0` is a valid `struct request`. By
+ // existence of `&mut self` we have exclusive access.
+ unsafe { bindings::blk_mq_end_request(request_ptr, bindings::BLK_STS_OK as _) };
+
+ Ok(())
+ }
+
+ /// Return a pointer to the `RequestDataWrapper` stored in the private area
+ /// of the request structure.
+ ///
+ /// # Safety
+ ///
+ /// - `this` must point to a valid allocation.
+ pub(crate) unsafe fn wrapper_ptr(this: *mut Self) -> NonNull<RequestDataWrapper> {
+ let request_ptr = this.cast::<bindings::request>();
+ let wrapper_ptr =
+ // SAFETY: By safety requirements for this function, `this` is a
+ // valid allocation.
+ unsafe { bindings::blk_mq_rq_to_pdu(request_ptr).cast::<RequestDataWrapper>() };
+ // SAFETY: By C api contract, wrapper_ptr points to a valid allocation
+ // and is not null.
+ unsafe { NonNull::new_unchecked(wrapper_ptr) }
+ }
+
+ /// Return a reference to the `RequestDataWrapper` stored in the private
+ /// area of the request structure.
+ pub(crate) fn wrapper_ref(&self) -> &RequestDataWrapper {
+ // SAFETY: By type invariant, `self.0` is a valid alocation. Further,
+ // the private data associated with this request is initialized and
+ // valid. The existence of `&self` guarantees that the private data is
+ // valid as a shared reference.
+ unsafe { Self::wrapper_ptr(self as *const Self as *mut Self).as_ref() }
+ }
+}
+
+/// A wrapper around data stored in the private area of the C `struct request`.
+pub(crate) struct RequestDataWrapper {
+ /// The Rust request refcount has the following states:
+ ///
+ /// - 0: The request is owned by C block layer.
+ /// - 1: The request is owned by Rust abstractions but there are no ARef references to it.
+ /// - 2+: There are `ARef` references to the request.
+ refcount: AtomicU64,
+}
+
+impl RequestDataWrapper {
+ /// Return a reference to the refcount of the request that is embedding
+ /// `self`.
+ pub(crate) fn refcount(&self) -> &AtomicU64 {
+ &self.refcount
+ }
+
+ /// Return a pointer to the refcount of the request that is embedding the
+ /// pointee of `this`.
+ ///
+ /// # Safety
+ ///
+ /// - `this` must point to a live allocation of at least the size of `Self`.
+ pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut AtomicU64 {
+ // SAFETY: Because of the safety requirements of this function, the
+ // field projection is safe.
+ unsafe { addr_of_mut!((*this).refcount) }
+ }
+}
+
+// SAFETY: Exclusive access is thread-safe for `Request`. `Request` has no `&mut
+// self` methods and `&self` methods that mutate `self` are internally
+// synchronzied.
+unsafe impl<T: Operations> Send for Request<T> {}
+
+// SAFETY: Shared access is thread-safe for `Request`. `&self` methods that
+// mutate `self` are internally synchronized`
+unsafe impl<T: Operations> Sync for Request<T> {}
+
+/// Store the result of `op(target.load())` in target, returning new value of
+/// taret.
+fn atomic_relaxed_op_return(target: &AtomicU64, op: impl Fn(u64) -> u64) -> u64 {
+ let mut old = target.load(Ordering::Relaxed);
+ loop {
+ match target.compare_exchange_weak(old, op(old), Ordering::Relaxed, Ordering::Relaxed) {
+ Ok(_) => break,
+ Err(x) => {
+ old = x;
+ }
+ }
+ }
+
+ op(old)
+}
+
+/// Store the result of `op(target.load)` in `target` if `target.load() !=
+/// pred`, returning previous value of target
+fn atomic_relaxed_op_unless(target: &AtomicU64, op: impl Fn(u64) -> u64, pred: u64) -> bool {
+ let x = target.load(Ordering::Relaxed);
+ loop {
+ if x == pred {
+ break;
+ }
+ if target
+ .compare_exchange_weak(x, op(x), Ordering::Relaxed, Ordering::Relaxed)
+ .is_ok()
+ {
+ break;
+ }
+ }
+
+ x == pred
+}
+
+// SAFETY: All instances of `Request<T>` are reference counted. This
+// implementation of `AlwaysRefCounted` ensure that increments to the ref count
+// keeps the object alive in memory at least until a matching reference count
+// decrement is executed.
+unsafe impl<T: Operations> AlwaysRefCounted for Request<T> {
+ fn inc_ref(&self) {
+ let refcount = &self.wrapper_ref().refcount();
+
+ #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))]
+ let updated = atomic_relaxed_op_unless(refcount, |x| x + 1, 0);
+
+ #[cfg(CONFIG_DEBUG_MISC)]
+ if !updated {
+ panic!("Request refcount zero on clone")
+ }
+ }
+
+ unsafe fn dec_ref(obj: core::ptr::NonNull<Self>) {
+ // SAFETY: The type invariants of `ARef` guarantee that `obj` is valid
+ // for read.
+ let wrapper_ptr = unsafe { Self::wrapper_ptr(obj.as_ptr()).as_ptr() };
+ // SAFETY: The type invariant of `Request` guarantees that the private
+ // data area is initialized and valid.
+ let refcount = unsafe { &*RequestDataWrapper::refcount_ptr(wrapper_ptr) };
+
+ #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))]
+ let new_refcount = atomic_relaxed_op_return(refcount, |x| x - 1);
+
+ #[cfg(CONFIG_DEBUG_MISC)]
+ if new_refcount == 0 {
+ panic!("Request reached refcount zero in Rust abstractions");
+ }
+ }
+}
diff --git a/rust/kernel/block/mq/tag_set.rs b/rust/kernel/block/mq/tag_set.rs
new file mode 100644
index 000000000000..4217c2b03ff3
--- /dev/null
+++ b/rust/kernel/block/mq/tag_set.rs
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This module provides the `TagSet` struct to wrap the C `struct blk_mq_tag_set`.
+//!
+//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h)
+
+use core::pin::Pin;
+
+use crate::{
+ bindings,
+ block::mq::request::RequestDataWrapper,
+ block::mq::{operations::OperationsVTable, Operations},
+ error::{self, Error, Result},
+ prelude::PinInit,
+ try_pin_init,
+ types::Opaque,
+};
+use core::{convert::TryInto, marker::PhantomData};
+use macros::{pin_data, pinned_drop};
+
+/// A wrapper for the C `struct blk_mq_tag_set`.
+///
+/// `struct blk_mq_tag_set` contains a `struct list_head` and so must be pinned.
+#[pin_data(PinnedDrop)]
+#[repr(transparent)]
+pub struct TagSet<T: Operations> {
+ #[pin]
+ inner: Opaque<bindings::blk_mq_tag_set>,
+ _p: PhantomData<T>,
+}
+
+impl<T: Operations> TagSet<T> {
+ /// Try to create a new tag set
+ pub fn try_new(
+ nr_hw_queues: u32,
+ num_tags: u32,
+ num_maps: u32,
+ ) -> impl PinInit<Self, error::Error> {
+ try_pin_init!( TagSet {
+ inner <- unsafe {kernel::init::pin_init_from_closure(move |place: *mut Opaque<bindings::blk_mq_tag_set>| -> Result<()> {
+ let place = place.cast::<bindings::blk_mq_tag_set>();
+
+ // SAFETY: try_ffi_init promises that `place` is writable, and
+ // zeroes is a valid bit pattern for this structure.
+ core::ptr::write_bytes(place, 0, 1);
+
+ /// For a raw pointer to a struct, write a struct field without
+ /// creating a reference to the field
+ macro_rules! write_ptr_field {
+ ($target:ident, $field:ident, $value:expr) => {
+ ::core::ptr::write(::core::ptr::addr_of_mut!((*$target).$field), $value)
+ };
+ }
+
+ // SAFETY: try_ffi_init promises that `place` is writable
+ write_ptr_field!(place, ops, OperationsVTable::<T>::build());
+ write_ptr_field!(place, nr_hw_queues , nr_hw_queues);
+ write_ptr_field!(place, timeout , 0); // 0 means default which is 30 * HZ in C
+ write_ptr_field!(place, numa_node , bindings::NUMA_NO_NODE);
+ write_ptr_field!(place, queue_depth , num_tags);
+ write_ptr_field!(place, cmd_size , core::mem::size_of::<RequestDataWrapper>().try_into()?);
+ write_ptr_field!(place, flags , bindings::BLK_MQ_F_SHOULD_MERGE);
+ write_ptr_field!(place, driver_data , core::ptr::null_mut::<core::ffi::c_void>());
+ write_ptr_field!(place, nr_maps , num_maps);
+
+ // SAFETY: Relevant fields of `place` are initialised above
+ let ret = bindings::blk_mq_alloc_tag_set(place);
+ if ret < 0 {
+ return Err(Error::from_errno(ret));
+ }
+
+ Ok(())
+ })},
+ _p: PhantomData,
+ })
+ }
+
+ /// Return the pointer to the wrapped `struct blk_mq_tag_set`
+ pub(crate) fn raw_tag_set(&self) -> *mut bindings::blk_mq_tag_set {
+ self.inner.get()
+ }
+}
+
+#[pinned_drop]
+impl<T: Operations> PinnedDrop for TagSet<T> {
+ fn drop(self: Pin<&mut Self>) {
+ // SAFETY: We are not moving self below
+ let this = unsafe { Pin::into_inner_unchecked(self) };
+
+ // SAFETY: `inner` is valid and has been properly initialised during construction.
+ unsafe { bindings::blk_mq_free_tag_set(this.inner.get()) };
+ }
+}
diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs
index 55280ae9fe40..145f5c397009 100644
--- a/rust/kernel/error.rs
+++ b/rust/kernel/error.rs
@@ -126,6 +126,12 @@ pub fn to_errno(self) -> core::ffi::c_int {
self.0
}
+ #[cfg(CONFIG_BLOCK)]
+ pub(crate) fn to_blk_status(self) -> bindings::blk_status_t {
+ // SAFETY: `self.0` is a valid error due to its invariant.
+ unsafe { bindings::errno_to_blk_status(self.0) }
+ }
+
/// Returns the error encoded as a pointer.
#[allow(dead_code)]
pub(crate) fn to_ptr<T>(self) -> *mut T {
diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
index 9a943d99c71a..79be44281fb4 100644
--- a/rust/kernel/lib.rs
+++ b/rust/kernel/lib.rs
@@ -27,6 +27,8 @@
extern crate self as kernel;
pub mod alloc;
+#[cfg(CONFIG_BLOCK)]
+pub mod block;
mod build_assert;
pub mod error;
pub mod init;
--
2.44.0
From: Andreas Hindborg <[email protected]>
This patch adds an initial version of the Rust null block driver.
Signed-off-by: Andreas Hindborg <[email protected]>
---
drivers/block/Kconfig | 9 +++++
drivers/block/Makefile | 3 ++
drivers/block/rnull.rs | 86 +++++++++++++++++++++++++++++++++++++++++
rust/kernel/block/mq.rs | 4 +-
4 files changed, 101 insertions(+), 1 deletion(-)
create mode 100644 drivers/block/rnull.rs
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 5b9d4aaebb81..ed209f4f2798 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -354,6 +354,15 @@ config VIRTIO_BLK
This is the virtual block driver for virtio. It can be used with
QEMU based VMMs (like KVM or Xen). Say Y or M.
+config BLK_DEV_RUST_NULL
+ tristate "Rust null block driver (Experimental)"
+ depends on RUST
+ help
+ This is the Rust implementation of the null block driver. For now it
+ is only a minimal stub.
+
+ If unsure, say N.
+
config BLK_DEV_RBD
tristate "Rados block device (RBD)"
depends on INET && BLOCK
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 101612cba303..1105a2d4fdcb 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -9,6 +9,9 @@
# needed for trace events
ccflags-y += -I$(src)
+obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o
+rnull_mod-y := rnull.o
+
obj-$(CONFIG_MAC_FLOPPY) += swim3.o
obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o
obj-$(CONFIG_BLK_DEV_FD) += floppy.o
diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs
new file mode 100644
index 000000000000..1d6ab6f0f26f
--- /dev/null
+++ b/drivers/block/rnull.rs
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This is a Rust implementation of the C null block driver.
+//!
+//! Supported features:
+//!
+//! - blk-mq interface
+//! - direct completion
+//! - block size 4k
+//!
+//! The driver is not configurable.
+
+use kernel::{
+ alloc::flags,
+ block::mq::{
+ self,
+ gen_disk::{self, GenDisk},
+ Operations, TagSet,
+ },
+ error::Result,
+ new_mutex, pr_info,
+ prelude::*,
+ sync::{Arc, Mutex},
+ types::ARef,
+};
+
+module! {
+ type: NullBlkModule,
+ name: "rnull_mod",
+ author: "Andreas Hindborg",
+ license: "GPL v2",
+}
+
+struct NullBlkModule {
+ _disk: Pin<Box<Mutex<GenDisk<NullBlkDevice, gen_disk::Added>>>>,
+}
+
+fn add_disk(tagset: Arc<TagSet<NullBlkDevice>>) -> Result<GenDisk<NullBlkDevice, gen_disk::Added>> {
+ let block_size: u16 = 4096;
+ if block_size % 512 != 0 || !(512..=4096).contains(&block_size) {
+ return Err(kernel::error::code::EINVAL);
+ }
+
+ let mut disk = gen_disk::try_new(tagset)?;
+ disk.set_name(format_args!("rnullb{}", 0))?;
+ disk.set_capacity_sectors(4096 << 11);
+ disk.set_queue_logical_block_size(block_size.into());
+ disk.set_queue_physical_block_size(block_size.into());
+ disk.set_rotational(false);
+ disk.add()
+}
+
+impl kernel::Module for NullBlkModule {
+ fn init(_module: &'static ThisModule) -> Result<Self> {
+ pr_info!("Rust null_blk loaded\n");
+ let tagset = Arc::pin_init(TagSet::try_new(1, 256, 1), flags::GFP_KERNEL)?;
+ let disk = Box::pin_init(
+ new_mutex!(add_disk(tagset)?, "nullb:disk"),
+ flags::GFP_KERNEL,
+ )?;
+
+ Ok(Self { _disk: disk })
+ }
+}
+
+struct NullBlkDevice;
+
+#[vtable]
+impl Operations for NullBlkDevice {
+ #[inline(always)]
+ fn queue_rq(rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result {
+ mq::Request::end_ok(rq)
+ .map_err(|_e| kernel::error::code::EIO)
+ .expect("Failed to complete request");
+
+ Ok(())
+ }
+
+ fn commit_rqs() {}
+
+ fn complete(rq: ARef<mq::Request<Self>>) {
+ mq::Request::end_ok(rq)
+ .map_err(|_e| kernel::error::code::EIO)
+ .expect("Failed to complete request")
+ }
+}
diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs
index efbd2588791b..54e032bbdffd 100644
--- a/rust/kernel/block/mq.rs
+++ b/rust/kernel/block/mq.rs
@@ -51,6 +51,7 @@
//!
//! ```rust
//! use kernel::{
+//! alloc::flags,
//! block::mq::*,
//! new_mutex,
//! prelude::*,
@@ -77,7 +78,8 @@
//! }
//! }
//!
-//! let tagset: Arc<TagSet<MyBlkDevice>> = Arc::pin_init(TagSet::try_new(1, 256, 1))?;
+//! let tagset: Arc<TagSet<MyBlkDevice>> =
+//! Arc::pin_init(TagSet::try_new(1, 256, 1), flags::GFP_KERNEL)?;
//! let mut disk = gen_disk::try_new(tagset)?;
//! disk.set_name(format_args!("myblk"))?;
//! disk.set_capacity_sectors(4096);
--
2.44.0
On 21.05.24 16:03, Andreas Hindborg wrote:
> From: Andreas Hindborg <[email protected]>
>
> This patch adds an initial version of the Rust null block driver.
>
> Signed-off-by: Andreas Hindborg <[email protected]>
> ---
> drivers/block/Kconfig | 9 +++++
> drivers/block/Makefile | 3 ++
> drivers/block/rnull.rs | 86 +++++++++++++++++++++++++++++++++++++++++
> rust/kernel/block/mq.rs | 4 +-
> 4 files changed, 101 insertions(+), 1 deletion(-)
> create mode 100644 drivers/block/rnull.rs
>
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index 5b9d4aaebb81..ed209f4f2798 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -354,6 +354,15 @@ config VIRTIO_BLK
> This is the virtual block driver for virtio. It can be used with
> QEMU based VMMs (like KVM or Xen). Say Y or M.
>
> +config BLK_DEV_RUST_NULL
> + tristate "Rust null block driver (Experimental)"
> + depends on RUST
> + help
> + This is the Rust implementation of the null block driver. For now it
> + is only a minimal stub.
> +
> + If unsure, say N.
> +
> config BLK_DEV_RBD
> tristate "Rados block device (RBD)"
> depends on INET && BLOCK
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index 101612cba303..1105a2d4fdcb 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -9,6 +9,9 @@
> # needed for trace events
> ccflags-y += -I$(src)
>
> +obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o
> +rnull_mod-y := rnull.o
> +
> obj-$(CONFIG_MAC_FLOPPY) += swim3.o
> obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o
> obj-$(CONFIG_BLK_DEV_FD) += floppy.o
> diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs
> new file mode 100644
> index 000000000000..1d6ab6f0f26f
> --- /dev/null
> +++ b/drivers/block/rnull.rs
> @@ -0,0 +1,86 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +//! This is a Rust implementation of the C null block driver.
> +//!
> +//! Supported features:
> +//!
> +//! - blk-mq interface
> +//! - direct completion
> +//! - block size 4k
> +//!
> +//! The driver is not configurable.
> +
> +use kernel::{
> + alloc::flags,
> + block::mq::{
> + self,
> + gen_disk::{self, GenDisk},
> + Operations, TagSet,
> + },
> + error::Result,
> + new_mutex, pr_info,
> + prelude::*,
> + sync::{Arc, Mutex},
> + types::ARef,
> +};
> +
> +module! {
> + type: NullBlkModule,
> + name: "rnull_mod",
> + author: "Andreas Hindborg",
> + license: "GPL v2",
> +}
> +
> +struct NullBlkModule {
> + _disk: Pin<Box<Mutex<GenDisk<NullBlkDevice, gen_disk::Added>>>>,
> +}
> +
> +fn add_disk(tagset: Arc<TagSet<NullBlkDevice>>) -> Result<GenDisk<NullBlkDevice, gen_disk::Added>> {
Any reason that this is its own function and not in the
`NullBlkModule::init` function?
> + let block_size: u16 = 4096;
> + if block_size % 512 != 0 || !(512..=4096).contains(&block_size) {
> + return Err(kernel::error::code::EINVAL);
> + }
> +
> + let mut disk = gen_disk::try_new(tagset)?;
> + disk.set_name(format_args!("rnullb{}", 0))?;
> + disk.set_capacity_sectors(4096 << 11);
> + disk.set_queue_logical_block_size(block_size.into());
> + disk.set_queue_physical_block_size(block_size.into());
> + disk.set_rotational(false);
> + disk.add()
> +}
> +
> +impl kernel::Module for NullBlkModule {
> + fn init(_module: &'static ThisModule) -> Result<Self> {
> + pr_info!("Rust null_blk loaded\n");
> + let tagset = Arc::pin_init(TagSet::try_new(1, 256, 1), flags::GFP_KERNEL)?;
> + let disk = Box::pin_init(
> + new_mutex!(add_disk(tagset)?, "nullb:disk"),
> + flags::GFP_KERNEL,
> + )?;
> +
> + Ok(Self { _disk: disk })
> + }
> +}
> +
> +struct NullBlkDevice;
> +
> +#[vtable]
> +impl Operations for NullBlkDevice {
> + #[inline(always)]
> + fn queue_rq(rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result {
> + mq::Request::end_ok(rq)
> + .map_err(|_e| kernel::error::code::EIO)
> + .expect("Failed to complete request");
This error would only happen if `rq` is not the only ARef to that
request, right?
> +
> + Ok(())
> + }
> +
> + fn commit_rqs() {}
> +
> + fn complete(rq: ARef<mq::Request<Self>>) {
Am I correct in thinking that this function is never actually called,
since all requests that are queued are immediately ended?
> + mq::Request::end_ok(rq)
> + .map_err(|_e| kernel::error::code::EIO)
> + .expect("Failed to complete request")
> + }
> +}
> diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs
> index efbd2588791b..54e032bbdffd 100644
> --- a/rust/kernel/block/mq.rs
> +++ b/rust/kernel/block/mq.rs
> @@ -51,6 +51,7 @@
> //!
> //! ```rust
> //! use kernel::{
> +//! alloc::flags,
> //! block::mq::*,
> //! new_mutex,
> //! prelude::*,
> @@ -77,7 +78,8 @@
> //! }
> //! }
> //!
> -//! let tagset: Arc<TagSet<MyBlkDevice>> = Arc::pin_init(TagSet::try_new(1, 256, 1))?;
> +//! let tagset: Arc<TagSet<MyBlkDevice>> =
> +//! Arc::pin_init(TagSet::try_new(1, 256, 1), flags::GFP_KERNEL)?;
This change should probably be in the patch before (seems like an
artifact from rebasing).
---
Cheers,
Benno
> //! let mut disk = gen_disk::try_new(tagset)?;
> //! disk.set_name(format_args!("myblk"))?;
> //! disk.set_capacity_sectors(4096);
> --
> 2.44.0
>
>
Benno Lossin <[email protected]> writes:
> On 21.05.24 16:03, Andreas Hindborg wrote:
>> From: Andreas Hindborg <[email protected]>
[...]
>> >> +
>> +//! This module provides types for implementing block drivers that interface the
>> +//! blk-mq subsystem.
>> +//!
>> +//! To implement a block device driver, a Rust module must do the following:
>> +//!
>> +//! - Implement [`Operations`] for a type `T`
>> +//! - Create a [`TagSet<T>`]
>> +//! - Create a [`gen_disk::GenDisk<T>`], passing in the `TagSet` reference
>
> I would use short links [`GenDisk<T>`].
`GenDisk` is not in scope, so short link is not working. But I can do
whatever this is called:
//! - Create a [`GenDisk<T>`], passing in the `TagSet` reference
..
//! [`GenDisk<T>`]: gen_disk::GenDisk
Would you prefer that?
>
>> +//! - Add the disk to the system by calling [`gen_disk::GenDisk::add`]
>> +//!
>> +//! The types available in this module that have direct C counterparts are:
>> +//!
>> +//! - The `TagSet` type that abstracts the C type `struct tag_set`.
>
> Missing link? (also below)
`TagSet` was linked above. Would you link it on each mention?
[...]
>> +//!
>> +//! fn commit_rqs(
>> +//! ) {
>> +//! }
>
> Formatting.
I would love if `rustfmt` would do this. But I think it is both unstable
and broken for examples like this [1]. I'll fix it up by hand.
How do you manage formatting in examples? By hand?
[...]
>> +/// A generic block device.
>> +///
>> +/// # Invariants
>> +///
>> +/// - `gendisk` must always point to an initialized and valid `struct gendisk`.
>> +pub struct GenDisk<T: Operations, S: GenDiskState> {
>
> Does it make sense to do `S: GenDiskState = Added`?
Yes, that would make sense.
>
>> + _tagset: Arc<TagSet<T>>,
>
> Does this really need a leading underscore?
It does not.
>
>> + gendisk: *mut bindings::gendisk,
>> + _phantom: core::marker::PhantomData<S>,
>> +}
>> +
>> +// SAFETY: `GenDisk` is an owned pointer to a `struct gendisk` and an `Arc` to a
>> +// `TagSet` It is safe to send this to other threads as long as T is Send.
>> +unsafe impl<T: Operations + Send, S: GenDiskState> Send for GenDisk<T, S> {}
>> +
>> +/// Disks in this state are allocated and initialized, but are not yet
>> +/// accessible from the kernel VFS.
>> +pub enum Initialized {}
>> +
>> +/// Disks in this state have been attached to the kernel VFS and may receive IO
>> +/// requests.
>> +pub enum Added {}
>> +
>> +/// Typestate representing states of a `GenDisk`.
>> +pub trait GenDiskState {}
>
> Should this trait be sealed?
Yes.
>
>> +
>> +impl GenDiskState for Initialized {}
>> +impl GenDiskState for Added {}
>> +
>> +impl<T: Operations> GenDisk<T, Initialized> {
>> + /// Register the device with the kernel. When this function returns, the
>> + /// device is accessible from VFS. The kernel may issue reads to the device
>> + /// during registration to discover partition information.
>> + pub fn add(self) -> Result<GenDisk<T, Added>> {
>> + crate::error::to_result(
>> + // SAFETY: By type invariant, `self.gendisk` points to a valid and
>> + // initialized instance of `struct gendisk`
>> + unsafe {
>> + bindings::device_add_disk(
>> + core::ptr::null_mut(),
>> + self.gendisk,
>> + core::ptr::null_mut(),
>> + )
>> + },
>> + )?;
>> +
>> + // We don't want to run the destuctor and remove the device from the VFS
>> + // when `disk` is dropped.
>> + let mut old = core::mem::ManuallyDrop::new(self);
>> +
>> + let new = GenDisk {
>> + _tagset: old._tagset.clone(),
>> + gendisk: old.gendisk,
>> + _phantom: PhantomData,
>> + };
>> +
>> + // But we have to drop the `Arc` or it will leak.
>> + // SAFETY: `old._tagset` is valid for write, aligned, non-null, and we
>> + // have exclusive access. We are not accessing the value again after it
>> + // is dropped.
>> + unsafe { core::ptr::drop_in_place(&mut old._tagset) };
>> +
>> + Ok(new)
>
> Instead of using `ManuallyDrop` and `drop_in_place` why not use
> `transmute`? I feel like that would be a lot simpler.
I was considering the layout not being deterministic for `repr(Rust)`. I
think that in practice it will be the case that the two types will have
the same layout, but I could not find the documentation that states
this. Nomicon does not talk about zero sized types [2].
[...]
>> +impl<T: Operations, S: GenDiskState> GenDisk<T, S> {
>> + /// Call to tell the block layer the capacity of the device in sectors (512
>> + /// bytes).
>> + pub fn set_capacity_sectors(&self, sectors: u64) {
>> + // SAFETY: By type invariant, `self.gendisk` points to a valid and
>> + // initialized instance of `struct gendisk`. Callee takes a lock to
>
> By `Callee`, do you mean `set_capacity`? If so, I would prefer that.
Yes. I will update.
[...]
>> +impl<T: Operations, S: GenDiskState> Drop for GenDisk<T, S> {
>> + fn drop(&mut self) {
>> + // TODO: This will `WARN` if the disk was not added. Since we cannot
>> + // specialize drop, we have to call it, or track state with a flag.
>
> You could add an associated constant to GenDiskState and then write the
> following:
>
> if <S as GenDiskState>::DELETE_ON_DROP {
> /* del_gendisk ... */
> }
>
> Then the check is essentially done at compile-time.
Thanks, this is very useful. I will add it.
>
>> +
>> + // SAFETY: By type invariant, `self.gendisk` points to a valid and
>> + // initialized instance of `struct gendisk`
>> + unsafe { bindings::del_gendisk(self.gendisk) };
>> + }
>> +}
>> +
>> +/// Try to create a new `GenDisk`.
>> +pub fn try_new<T: Operations>(tagset: Arc<TagSet<T>>) -> Result<GenDisk<T, Initialized>> {
>
> Why is this not inside of an `impl` block of `GenDisk<T, Initialized>`?
No particular reason. I should probably move it.
[...]
>> diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs
>> new file mode 100644
>> index 000000000000..3bd1af2c2260
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/operations.rs
>> @@ -0,0 +1,245 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! This module provides an interface for blk-mq drivers to implement.
>> +//!
>> +//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h)
>> +
>> +use crate::{
>> + bindings,
>> + block::mq::request::RequestDataWrapper,
>> + block::mq::Request,
>> + error::{from_result, Result},
>> + types::ARef,
>> +};
>> +use core::{marker::PhantomData, sync::atomic::AtomicU64, sync::atomic::Ordering};
>> +
>> +/// Implement this trait to interface blk-mq as block devices.
>> +///
>> +/// To implement a block device driver, implement this trait as described in the
>> +/// [module level documentation]. The kernel will use the implementation of the
>> +/// functions defined in this trait to interface a block device driver. Note:
>> +/// There is no need for an exit_request() implementation, because the `drop`
>> +/// implementation of the [`Request`] type will be invoked by automatically by
>> +/// the C/Rust glue logic.
>
> This text is wrapped to 80 columns, but our usual wrapping is 100.
This had me scratch my head for a bit. I run `make rustfmt` and `make
rustfmtcheck`, so I had no idea why my code would be formatted
incorrect. It took me a while to figure out that we are not enabling
`comment_width = 100`, presumably because it is an unstable `rustfmt`
feature. I am not sure what the correct way to enable it but I hacked
the Makefile and enabled it. It gives a huge diff all across the kernel
crate.
So, it seems we _are_ in fact using 80 line fill column, since that is
what much of our existing code is using, and that is what the build
system is configured to use.
Where did you come across the 100 character fill column?
Anyways, we should configure our tools to the standard we want. I don't
care if it's 80 or 100, as long as I can have the tools do the job for
me.
Let's discuss this at next meeting.
>
>> +///
>> +/// [module level documentation]: kernel::block::mq
>> +#[macros::vtable]
>> +pub trait Operations: Sized {
>> + /// Called by the kernel to queue a request with the driver. If `is_last` is
>> + /// `false`, the driver is allowed to defer committing the request.
>> + fn queue_rq(rq: ARef<Request<Self>>, is_last: bool) -> Result;
>> +
>> + /// Called by the kernel to indicate that queued requests should be submitted.
>> + fn commit_rqs();
>> +
>> + /// Called by the kernel when the request is completed.
>> + fn complete(_rq: ARef<Request<Self>>);
>
> Is there a reason for the `_`?
Copy pasta probably. Will remove.
>
> To me it seems this is called when the given request is already done, so
> would it make more sense to name it `completed` or `on_completion`?
I would agree. But we had a bit of discussion at LSF about naming things
differently in Rust vs C. C people prefer if we keep the C names, even
if they do not make sense to the people who write the new Rust code.
In C, the vtable entry is called `complete_callback` and the called
symbol is usually `my_driver_complete_rq`.
We could go with `completed`, `completed_callback`, or `complete_rq`.
Although `completed` sounds like it should return a bool indicating
whether the request was already completed.
I think I'll leave it for now, and we can always change it if we come up
with a really good name.
>
>> +
>> + /// Called by the kernel to poll the device for completed requests. Only
>> + /// used for poll queues.
>> + fn poll() -> bool {
>> + crate::build_error(crate::error::VTABLE_DEFAULT_ERROR)
>> + }
>> +}
>> +
>> +/// A vtable for blk-mq to interact with a block device driver.
>> +///
>> +/// A `bindings::blk_mq_opa` vtable is constructed from pointers to the `extern
>> +/// "C"` functions of this struct, exposed through the `OperationsVTable::VTABLE`.
>> +///
>> +/// For general documentation of these methods, see the kernel source
>> +/// documentation related to `struct blk_mq_operations` in
>> +/// [`include/linux/blk-mq.h`].
>> +///
>> +/// [`include/linux/blk-mq.h`]: srctree/include/linux/blk-mq.h
>> +pub(crate) struct OperationsVTable<T: Operations>(PhantomData<T>);
>> +
>> +impl<T: Operations> OperationsVTable<T> {
>> + /// This function is called by the C kernel. A pointer to this function is
>> + /// installed in the `blk_mq_ops` vtable for the driver.
>> + ///
>> + /// # Safety
>> + ///
>> + /// - The caller of this function must ensure `bd` is valid
>> + /// and initialized. The pointees must outlive this function.
>> + /// - This function must not be called with a `hctx` for which
>> + /// `Self::exit_hctx_callback()` has been called.
>> + /// - (*bd).rq must point to a valid `bindings:request` for which
>> + /// `OperationsVTable<T>::init_request_callback` was called
>> + unsafe extern "C" fn queue_rq_callback(
>> + _hctx: *mut bindings::blk_mq_hw_ctx,
>> + bd: *const bindings::blk_mq_queue_data,
>> + ) -> bindings::blk_status_t {
>> + // SAFETY: `bd.rq` is valid as required by the safety requirement for
>> + // this function.
>> + let request = unsafe { &*(*bd).rq.cast::<Request<T>>() };
>> +
>> + // One refcount for the ARef, one for being in flight
>> + request.wrapper_ref().refcount().store(2, Ordering::Relaxed);
>> +
>> + let rq =
>> + // SAFETY: We own a refcount that we took above. We pass that to `ARef`.
>> + // By the safety requirements of this function, `request` is a valid
>> + // `struct request` and the private data is properly initialized.
>> + unsafe {Request::aref_from_raw((*bd).rq)};
>
> Can you put the SAFETY comment above the line, then the formatting is
> more natural.
Yes.
>
>> +
>> + // SAFETY: We have exclusive access and we just set the refcount above.
>> + unsafe { Request::start_unchecked(&rq) };
>> +
>> + let ret = T::queue_rq(
>> + rq,
>> + // SAFETY: `bd` is valid as required by the safety requirement for this function.
>> + unsafe { (*bd).last },
>> + );
>> +
>> + if let Err(e) = ret {
>> + e.to_blk_status()
>> + } else {
>> + bindings::BLK_STS_OK as _
>> + }
>> + }
>> +
>> + /// This function is called by the C kernel. A pointer to this function is
>> + /// installed in the `blk_mq_ops` vtable for the driver.
>> + ///
>> + /// # Safety
>> + ///
>> + /// This function may only be called by blk-mq C infrastructure.
>> + unsafe extern "C" fn commit_rqs_callback(_hctx: *mut bindings::blk_mq_hw_ctx) {
>> + T::commit_rqs()
>> + }
>> +
>> + /// This function is called by the C kernel. A pointer to this function is
>> + /// installed in the `blk_mq_ops` vtable for the driver.
>> + ///
>> + /// # Safety
>> + ///
>> + /// This function may only be called by blk-mq C infrastructure. `rq` must
>> + /// point to a valid request that has been marked as completed. The pointee
>> + /// of `rq` must be valid for write for the duration of this function.
>> + unsafe extern "C" fn complete_callback(rq: *mut bindings::request) {
>> + // SAFETY: This function can only be dispatched through
>> + // `Request::complete`. We leaked a refcount then which we pick back up
>> + // now.
>
> What do you mean by the first sentence? Where was this refcount leaked?
I can understand why you are confused. `Request::complete` is in my
outgoing patch queue. There is no way to call this function in this
patch. I will move it to a later patch. Thanks.
[...]
>> diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs
>> new file mode 100644
>> index 000000000000..4f7e4692b592
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/raw_writer.rs
>> @@ -0,0 +1,55 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +use core::fmt::{self, Write};
>> +
>> +use crate::error::Result;
>> +use crate::prelude::EINVAL;
>> +
>> +/// A mutable reference to a byte buffer where a string can be written into.
>> +///
>> +/// # Invariants
>> +///
>> +/// `buffer` is always null terminated.
>
> I don't know how valuable this would be, but you could only ask for this
> invariant, if `buffer` is non-empty. Then you could have the `new` and
> `from_array` functions return a `RawWriter` without result.
I think guaranteeing at least a null character is always written by
`write_str` is a good thing. It is used for writing C strings to device
name fields. `write_str` with a zero size buffer would give undesirable
results, and is probably not what the caller wants.
>
>> +pub(crate) struct RawWriter<'a> {
>> + buffer: &'a mut [u8],
>> + pos: usize,
>> +}
>> +
>> +impl<'a> RawWriter<'a> {
>> + /// Create a new `RawWriter` instance.
>> + fn new(buffer: &'a mut [u8]) -> Result<RawWriter<'a>> {
>> + *(buffer.last_mut().ok_or(EINVAL)?) = 0;
>> +
>> + // INVARIANT: We null terminated the buffer above
>
> Missing `.`.
????
[...]
>> +impl<T: Operations> Request<T> {
>> + /// Create an `ARef<Request>` from a `struct request` pointer.
>> + ///
>> + /// # Safety
>> + ///
>> + /// * The caller must own a refcount on `ptr` that is transferred to the
>> + /// returned `ARef`.
>> + /// * The type invariants for `Request` must hold for the pointee of `ptr`.
>> + pub(crate) unsafe fn aref_from_raw(ptr: *mut bindings::request) -> ARef<Self> {
>> + // INVARIANTS: By the safety requirements of this function, invariants are upheld.
>
> Typo: INVARIANTS -> INVARIANT
????
>
>> + // SAFETY: By the safety requirement of this function, we own a
>> + // reference count that we can pass to `ARef`.
>> + unsafe { ARef::from_raw(NonNull::new_unchecked(ptr as *const Self as *mut Self)) }
>> + }
>> +
>> + /// Notify the block layer that a request is going to be processed now.
>> + ///
>> + /// The block layer uses this hook to do proper initializations such as
>> + /// starting the timeout timer. It is a requirement that block device
>> + /// drivers call this function when starting to process a request.
>> + ///
>> + /// # Safety
>> + ///
>> + /// The caller must have exclusive ownership of `self`, that is
>> + /// `self.wrapper_ref().refcount() == 2`.
>> + pub(crate) unsafe fn start_unchecked(this: &ARef<Self>) {
>> + // SAFETY: By type invariant, `self.0` is a valid `struct request`. By
>> + // existence of `&mut self` we have exclusive access.
>> + unsafe { bindings::blk_mq_start_request(this.0.get()) };
>> + }
>> +
>> + fn try_set_end(this: ARef<Self>) -> Result<ARef<Self>, ARef<Self>> {
>> + // We can race with `TagSet::tag_to_rq`
>> + match this.wrapper_ref().refcount().compare_exchange(
>> + 2,
>> + 0,
>> + Ordering::Relaxed,
>> + Ordering::Relaxed,
>> + ) {
>> + Err(_old) => Err(this),
>> + Ok(_) => Ok(this),
>> + }
>> + }
>> +
>> + /// Notify the block layer that the request has been completed without errors.
>> + ///
>> + /// This function will return `Err` if `this` is not the only `ARef`
>> + /// referencing the request.
>> + pub fn end_ok(this: ARef<Self>) -> Result<(), ARef<Self>> {
>
> I am not yet fully convinced that this is the way we should go. I think
> I would have to see a more complex usage of `Request` with that id <->
> request mapping that you mentioned. Because with how rnull uses this
> API, it could also have a `URef<Self>` parameter (URef := unique ARef).
I considered a `UniqueARef` but it would just move the error handing to
`ARef::into_unique` and then make `end_ok` infallible.
There are four states for a request that we need to track:
A) Request is owned by block layer (refcount 0)
B) Request is owned by driver but with zero `ARef`s in existence
(refcount 1)
C) Request is owned by driver with exactly one `ARef` in existence
(refcount 2)
D) Request is owned by driver with more than one `ARef` in existence
(refcount > 2)
It is in the doc comments for `RequestDataWrapper` as well.
We need A and B to ensure we fail tag to request conversions for
requests that are not owned by the driver.
We need C and D to ensure that it is safe to end the request and hand back
ownership to the block layer.
I will ping you when I hook up the NVMe driver with this.
>
>> + let this = Self::try_set_end(this)?;
>> + let request_ptr = this.0.get();
>> + core::mem::forget(this);
>
> Would be a good idea to mention who is going to own this refcount.
The refcount is zero after `try_set_end`, so there is no owner of the
count. The request will be in state A and thus block layer owns the
request. Block layer does not honor this refcount, it is only for the
driver to know.
Perhaps I should move the explanation up into the docs for `Request`.
>
>> +
>> + // SAFETY: By type invariant, `self.0` is a valid `struct request`. By
>> + // existence of `&mut self` we have exclusive access.
>> + unsafe { bindings::blk_mq_end_request(request_ptr, bindings::BLK_STS_OK as _) };
>> +
>> + Ok(())
>> + }
>> +
>> + /// Return a pointer to the `RequestDataWrapper` stored in the private area
>> + /// of the request structure.
>> + ///
>> + /// # Safety
>> + ///
>> + /// - `this` must point to a valid allocation.
>> + pub(crate) unsafe fn wrapper_ptr(this: *mut Self) -> NonNull<RequestDataWrapper> {
>> + let request_ptr = this.cast::<bindings::request>();
>> + let wrapper_ptr =
>> + // SAFETY: By safety requirements for this function, `this` is a
>> + // valid allocation.
>> + unsafe { bindings::blk_mq_rq_to_pdu(request_ptr).cast::<RequestDataWrapper>() };
>> + // SAFETY: By C api contract, wrapper_ptr points to a valid allocation
>> + // and is not null.
>> + unsafe { NonNull::new_unchecked(wrapper_ptr) }
>> + }
>> +
>> + /// Return a reference to the `RequestDataWrapper` stored in the private
>> + /// area of the request structure.
>> + pub(crate) fn wrapper_ref(&self) -> &RequestDataWrapper {
>> + // SAFETY: By type invariant, `self.0` is a valid alocation. Further,
>> + // the private data associated with this request is initialized and
>> + // valid. The existence of `&self` guarantees that the private data is
>> + // valid as a shared reference.
>> + unsafe { Self::wrapper_ptr(self as *const Self as *mut Self).as_ref() }
>> + }
>> +}
>> +
>> +/// A wrapper around data stored in the private area of the C `struct request`.
>> +pub(crate) struct RequestDataWrapper {
>
> Why is this called `Wrapper`? It doesn't really wrap anything,
> `RequestData` seems fine.
It will eventually wrap private data associated with the request. Those
patches will be submitted later. Should I change the name in the
meanwhile?
>
>> + /// The Rust request refcount has the following states:
>> + ///
>> + /// - 0: The request is owned by C block layer.
>> + /// - 1: The request is owned by Rust abstractions but there are no ARef references to it.
>> + /// - 2+: There are `ARef` references to the request.
>> + refcount: AtomicU64,
>> +}
>> +
>> +impl RequestDataWrapper {
>> + /// Return a reference to the refcount of the request that is embedding
>> + /// `self`.
>> + pub(crate) fn refcount(&self) -> &AtomicU64 {
>> + &self.refcount
>> + }
>> +
>> + /// Return a pointer to the refcount of the request that is embedding the
>> + /// pointee of `this`.
>> + ///
>> + /// # Safety
>> + ///
>> + /// - `this` must point to a live allocation of at least the size of `Self`.
>> + pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut AtomicU64 {
>> + // SAFETY: Because of the safety requirements of this function, the
>> + // field projection is safe.
>> + unsafe { addr_of_mut!((*this).refcount) }
>> + }
>> +}
>> +
>> +// SAFETY: Exclusive access is thread-safe for `Request`. `Request` has no `&mut
>> +// self` methods and `&self` methods that mutate `self` are internally
>> +// synchronzied.
>> +unsafe impl<T: Operations> Send for Request<T> {}
>> +
>> +// SAFETY: Shared access is thread-safe for `Request`. `&self` methods that
>> +// mutate `self` are internally synchronized`
>> +unsafe impl<T: Operations> Sync for Request<T> {}
>> +
>> +/// Store the result of `op(target.load())` in target, returning new value of
>> +/// taret.
>> +fn atomic_relaxed_op_return(target: &AtomicU64, op: impl Fn(u64) -> u64) -> u64 {
>> + let mut old = target.load(Ordering::Relaxed);
>> + loop {
>> + match target.compare_exchange_weak(old, op(old), Ordering::Relaxed, Ordering::Relaxed) {
>> + Ok(_) => break,
>> + Err(x) => {
>> + old = x;
>> + }
>> + }
>> + }
>
> This seems like a reimplementation of `AtomicU64::fetch_update` to me.
It looks like it! Except this function is returning the updated value,
`fetch_update` is returning the old value.
Would you rather that I rewrite in terms of the library function?
>
>> +
>> + op(old)
>> +}
>> +
>> +/// Store the result of `op(target.load)` in `target` if `target.load() !=
>> +/// pred`, returning previous value of target
>> +fn atomic_relaxed_op_unless(target: &AtomicU64, op: impl Fn(u64) -> u64, pred: u64) -> bool {
>> + let x = target.load(Ordering::Relaxed);
>> + loop {
>> + if x == pred {
>> + break;
>> + }
>> + if target
>> + .compare_exchange_weak(x, op(x), Ordering::Relaxed, Ordering::Relaxed)
>> + .is_ok()
>> + {
>> + break;
>> + }
>> + }
>> +
>> + x == pred
>> +}
>> +
>> +// SAFETY: All instances of `Request<T>` are reference counted. This
>> +// implementation of `AlwaysRefCounted` ensure that increments to the ref count
>> +// keeps the object alive in memory at least until a matching reference count
>> +// decrement is executed.
>> +unsafe impl<T: Operations> AlwaysRefCounted for Request<T> {
>> + fn inc_ref(&self) {
>> + let refcount = &self.wrapper_ref().refcount();
>> +
>> + #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))]
>
> Another option would be to use `_updated` as the name of the variable. I
> personally would prefer that. What do you guys think?
Either way is fine by me.
[...]
>> diff --git a/rust/kernel/block/mq/tag_set.rs b/rust/kernel/block/mq/tag_set.rs
>> new file mode 100644
>> index 000000000000..4217c2b03ff3
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/tag_set.rs
>> @@ -0,0 +1,93 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! This module provides the `TagSet` struct to wrap the C `struct blk_mq_tag_set`.
>> +//!
>> +//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h)
>> +
>> +use core::pin::Pin;
>> +
>> +use crate::{
>> + bindings,
>> + block::mq::request::RequestDataWrapper,
>> + block::mq::{operations::OperationsVTable, Operations},
>> + error::{self, Error, Result},
>> + prelude::PinInit,
>> + try_pin_init,
>> + types::Opaque,
>> +};
>> +use core::{convert::TryInto, marker::PhantomData};
>> +use macros::{pin_data, pinned_drop};
>> +
>> +/// A wrapper for the C `struct blk_mq_tag_set`.
>> +///
>> +/// `struct blk_mq_tag_set` contains a `struct list_head` and so must be pinned.
>> +#[pin_data(PinnedDrop)]
>> +#[repr(transparent)]
>> +pub struct TagSet<T: Operations> {
>> + #[pin]
>> + inner: Opaque<bindings::blk_mq_tag_set>,
>> + _p: PhantomData<T>,
>> +}
>> +
>> +impl<T: Operations> TagSet<T> {
>> + /// Try to create a new tag set
>> + pub fn try_new(
>> + nr_hw_queues: u32,
>> + num_tags: u32,
>> + num_maps: u32,
>> + ) -> impl PinInit<Self, error::Error> {
>> + try_pin_init!( TagSet {
>> + inner <- unsafe {kernel::init::pin_init_from_closure(move |place: *mut Opaque<bindings::blk_mq_tag_set>| -> Result<()> {
>
> We currently do not have `Opaque::try_ffi_init`, I vaguely remember that
> we talked about it. Do you mind adding it? Otherwise I can also send a
> patch.
I have a `try_ffi_init` patch queued. I removed it from here to cut
dependencies. I will submit it soon after this is in.
>
>> + let place = place.cast::<bindings::blk_mq_tag_set>();
>> +
>> + // SAFETY: try_ffi_init promises that `place` is writable, and
>> + // zeroes is a valid bit pattern for this structure.
>> + core::ptr::write_bytes(place, 0, 1);
>> +
>> + /// For a raw pointer to a struct, write a struct field without
>> + /// creating a reference to the field
>> + macro_rules! write_ptr_field {
>> + ($target:ident, $field:ident, $value:expr) => {
>> + ::core::ptr::write(::core::ptr::addr_of_mut!((*$target).$field), $value)
>> + };
>> + }
>> +
>> + // SAFETY: try_ffi_init promises that `place` is writable
>> + write_ptr_field!(place, ops, OperationsVTable::<T>::build());
>> + write_ptr_field!(place, nr_hw_queues , nr_hw_queues);
>> + write_ptr_field!(place, timeout , 0); // 0 means default which is 30 * HZ in C
>> + write_ptr_field!(place, numa_node , bindings::NUMA_NO_NODE);
>> + write_ptr_field!(place, queue_depth , num_tags);
>> + write_ptr_field!(place, cmd_size , core::mem::size_of::<RequestDataWrapper>().try_into()?);
>> + write_ptr_field!(place, flags , bindings::BLK_MQ_F_SHOULD_MERGE);
>> + write_ptr_field!(place, driver_data , core::ptr::null_mut::<core::ffi::c_void>());
>> + write_ptr_field!(place, nr_maps , num_maps);
>
> I think that there is some way for pinned-init to do a better job here.
> I feel like we ought to be able to just write:
>
> Opaque::init(
> try_init!(bindings::blk_mq_tag_set {
> ops: OperationsVTable::<T>::build(),
> nr_hw_queues,
> timeout: 0, // 0 means default, which is 30Hz
> numa_node: bindings::NUMA_NO_NODE,
> queue_depth: num_tags,
> cmd_size: size_of::<RequestDataWrapper>().try_into()?,
> flags: bindings::BLK_MQ_F_SHOULD_MERGE,
> driver_data: null_mut(),
> nr_maps: num_maps,
> ..Zeroable::zeroed()
> }? Error)
> .chain(|tag_set| to_result(bindings::blk_mq_alloc_tag_set(tag_set)))
> )
>
> But we don't have `Opaque::init` (shouldn't be difficult) and
> `bindings::blk_mq_tag_set` doesn't implement `Zeroable`. We would need
> bindgen to put `derive(Zeroable)` on certain structs...
>
> Another option would be to just list the fields explicitly, since there
> aren't that many. What do you think?
Both options sound good. Ofc the first one sounds more user friendly
while the latter one sounds easier to implement. Getting rid of the
unsafe blocks here would be really nice.
>
>> +
>> + // SAFETY: Relevant fields of `place` are initialised above
>> + let ret = bindings::blk_mq_alloc_tag_set(place);
>> + if ret < 0 {
>> + return Err(Error::from_errno(ret));
>> + }
>> +
>> + Ok(())
>> + })},
>> + _p: PhantomData,
>> + })
>> + }
>> +
>> + /// Return the pointer to the wrapped `struct blk_mq_tag_set`
>> + pub(crate) fn raw_tag_set(&self) -> *mut bindings::blk_mq_tag_set {
>> + self.inner.get()
>> + }
>> +}
>> +
>> +#[pinned_drop]
>> +impl<T: Operations> PinnedDrop for TagSet<T> {
>> + fn drop(self: Pin<&mut Self>) {
>> + // SAFETY: We are not moving self below
>> + let this = unsafe { Pin::into_inner_unchecked(self) };
>
> You don't need to unwrap the `Pin`, since you only need access to `&Self`
> and `Pin` always allows you to do that. (so just use `self` instead of
> `this` below)
Thanks ????
>
>> +
>> + // SAFETY: `inner` is valid and has been properly initialised during construction.
>
> Should be an invariant.
Ok ????
Thanks for the review! I will send a new version.
Best regards,
Andreas
[1] https://github.com/rust-lang/rustfmt/issues/3348
[2] https://doc.rust-lang.org/nomicon/repr-rust.html#reprrust
Benno Lossin <[email protected]> writes:
> On 21.05.24 16:03, Andreas Hindborg wrote:
>> From: Andreas Hindborg <[email protected]>
>>
>> This patch adds an initial version of the Rust null block driver.
>>
>> Signed-off-by: Andreas Hindborg <[email protected]>
>> ---
>> drivers/block/Kconfig | 9 +++++
>> drivers/block/Makefile | 3 ++
>> drivers/block/rnull.rs | 86 +++++++++++++++++++++++++++++++++++++++++
>> rust/kernel/block/mq.rs | 4 +-
>> 4 files changed, 101 insertions(+), 1 deletion(-)
>> create mode 100644 drivers/block/rnull.rs
>>
>> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
>> index 5b9d4aaebb81..ed209f4f2798 100644
>> --- a/drivers/block/Kconfig
>> +++ b/drivers/block/Kconfig
>> @@ -354,6 +354,15 @@ config VIRTIO_BLK
>> This is the virtual block driver for virtio. It can be used with
>> QEMU based VMMs (like KVM or Xen). Say Y or M.
>>
>> +config BLK_DEV_RUST_NULL
>> + tristate "Rust null block driver (Experimental)"
>> + depends on RUST
>> + help
>> + This is the Rust implementation of the null block driver. For now it
>> + is only a minimal stub.
>> +
>> + If unsure, say N.
>> +
>> config BLK_DEV_RBD
>> tristate "Rados block device (RBD)"
>> depends on INET && BLOCK
>> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
>> index 101612cba303..1105a2d4fdcb 100644
>> --- a/drivers/block/Makefile
>> +++ b/drivers/block/Makefile
>> @@ -9,6 +9,9 @@
>> # needed for trace events
>> ccflags-y += -I$(src)
>>
>> +obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o
>> +rnull_mod-y := rnull.o
>> +
>> obj-$(CONFIG_MAC_FLOPPY) += swim3.o
>> obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o
>> obj-$(CONFIG_BLK_DEV_FD) += floppy.o
>> diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs
>> new file mode 100644
>> index 000000000000..1d6ab6f0f26f
>> --- /dev/null
>> +++ b/drivers/block/rnull.rs
>> @@ -0,0 +1,86 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! This is a Rust implementation of the C null block driver.
>> +//!
>> +//! Supported features:
>> +//!
>> +//! - blk-mq interface
>> +//! - direct completion
>> +//! - block size 4k
>> +//!
>> +//! The driver is not configurable.
>> +
>> +use kernel::{
>> + alloc::flags,
>> + block::mq::{
>> + self,
>> + gen_disk::{self, GenDisk},
>> + Operations, TagSet,
>> + },
>> + error::Result,
>> + new_mutex, pr_info,
>> + prelude::*,
>> + sync::{Arc, Mutex},
>> + types::ARef,
>> +};
>> +
>> +module! {
>> + type: NullBlkModule,
>> + name: "rnull_mod",
>> + author: "Andreas Hindborg",
>> + license: "GPL v2",
>> +}
>> +
>> +struct NullBlkModule {
>> + _disk: Pin<Box<Mutex<GenDisk<NullBlkDevice, gen_disk::Added>>>>,
>> +}
>> +
>> +fn add_disk(tagset: Arc<TagSet<NullBlkDevice>>) -> Result<GenDisk<NullBlkDevice, gen_disk::Added>> {
>
> Any reason that this is its own function and not in the
> `NullBlkModule::init` function?
Would you embed it inside the `init` function? To what end? I don't
think that would read well.
>
>> + let block_size: u16 = 4096;
>> + if block_size % 512 != 0 || !(512..=4096).contains(&block_size) {
>> + return Err(kernel::error::code::EINVAL);
>> + }
>> +
>> + let mut disk = gen_disk::try_new(tagset)?;
>> + disk.set_name(format_args!("rnullb{}", 0))?;
>> + disk.set_capacity_sectors(4096 << 11);
>> + disk.set_queue_logical_block_size(block_size.into());
>> + disk.set_queue_physical_block_size(block_size.into());
>> + disk.set_rotational(false);
>> + disk.add()
>> +}
>> +
>> +impl kernel::Module for NullBlkModule {
>> + fn init(_module: &'static ThisModule) -> Result<Self> {
>> + pr_info!("Rust null_blk loaded\n");
>> + let tagset = Arc::pin_init(TagSet::try_new(1, 256, 1), flags::GFP_KERNEL)?;
>> + let disk = Box::pin_init(
>> + new_mutex!(add_disk(tagset)?, "nullb:disk"),
>> + flags::GFP_KERNEL,
>> + )?;
>> +
>> + Ok(Self { _disk: disk })
>> + }
>> +}
>> +
>> +struct NullBlkDevice;
>> +
>> +#[vtable]
>> +impl Operations for NullBlkDevice {
>> + #[inline(always)]
>> + fn queue_rq(rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result {
>> + mq::Request::end_ok(rq)
>> + .map_err(|_e| kernel::error::code::EIO)
>> + .expect("Failed to complete request");
>
> This error would only happen if `rq` is not the only ARef to that
> request, right?
Yes, it should never happen. If it happens, something is seriously
broken and panic is adequate.
Other drivers might want to retry later or something, but in this case
it should just work.
>
>> +
>> + Ok(())
>> + }
>> +
>> + fn commit_rqs() {}
>> +
>> + fn complete(rq: ARef<mq::Request<Self>>) {
>
> Am I correct in thinking that this function is never actually called,
> since all requests that are queued are immediately ended?
Yes, re my other email. It is a callback that cannot be triggered for
now. I will move it to a later patch where it belongs.
>
>> + mq::Request::end_ok(rq)
>> + .map_err(|_e| kernel::error::code::EIO)
>> + .expect("Failed to complete request")
>> + }
>> +}
>> diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs
>> index efbd2588791b..54e032bbdffd 100644
>> --- a/rust/kernel/block/mq.rs
>> +++ b/rust/kernel/block/mq.rs
>> @@ -51,6 +51,7 @@
>> //!
>> //! ```rust
>> //! use kernel::{
>> +//! alloc::flags,
>> //! block::mq::*,
>> //! new_mutex,
>> //! prelude::*,
>> @@ -77,7 +78,8 @@
>> //! }
>> //! }
>> //!
>> -//! let tagset: Arc<TagSet<MyBlkDevice>> = Arc::pin_init(TagSet::try_new(1, 256, 1))?;
>> +//! let tagset: Arc<TagSet<MyBlkDevice>> =
>> +//! Arc::pin_init(TagSet::try_new(1, 256, 1), flags::GFP_KERNEL)?;
>
> This change should probably be in the patch before (seems like an
> artifact from rebasing).
Yes, thank you for spotting that. I thought I checked that this was
building, so this is strange to me. Maybe I failed to build the
doctests between the two patches. It is weird that kernel robot did not
pick this up. Maybe it is not building doctests?
Best regards,
Andreas
On 29.05.24 15:00, Andreas Hindborg wrote:
> Benno Lossin <[email protected]> writes:
>
>> On 21.05.24 16:03, Andreas Hindborg wrote:
>>> From: Andreas Hindborg <[email protected]>
>>>
>>> This patch adds an initial version of the Rust null block driver.
>>>
>>> Signed-off-by: Andreas Hindborg <[email protected]>
>>> ---
>>> drivers/block/Kconfig | 9 +++++
>>> drivers/block/Makefile | 3 ++
>>> drivers/block/rnull.rs | 86 +++++++++++++++++++++++++++++++++++++++++
>>> rust/kernel/block/mq.rs | 4 +-
>>> 4 files changed, 101 insertions(+), 1 deletion(-)
>>> create mode 100644 drivers/block/rnull.rs
>>>
>>> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
>>> index 5b9d4aaebb81..ed209f4f2798 100644
>>> --- a/drivers/block/Kconfig
>>> +++ b/drivers/block/Kconfig
>>> @@ -354,6 +354,15 @@ config VIRTIO_BLK
>>> This is the virtual block driver for virtio. It can be used with
>>> QEMU based VMMs (like KVM or Xen). Say Y or M.
>>>
>>> +config BLK_DEV_RUST_NULL
>>> + tristate "Rust null block driver (Experimental)"
>>> + depends on RUST
>>> + help
>>> + This is the Rust implementation of the null block driver. For now it
>>> + is only a minimal stub.
>>> +
>>> + If unsure, say N.
>>> +
>>> config BLK_DEV_RBD
>>> tristate "Rados block device (RBD)"
>>> depends on INET && BLOCK
>>> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
>>> index 101612cba303..1105a2d4fdcb 100644
>>> --- a/drivers/block/Makefile
>>> +++ b/drivers/block/Makefile
>>> @@ -9,6 +9,9 @@
>>> # needed for trace events
>>> ccflags-y += -I$(src)
>>>
>>> +obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o
>>> +rnull_mod-y := rnull.o
>>> +
>>> obj-$(CONFIG_MAC_FLOPPY) += swim3.o
>>> obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o
>>> obj-$(CONFIG_BLK_DEV_FD) += floppy.o
>>> diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs
>>> new file mode 100644
>>> index 000000000000..1d6ab6f0f26f
>>> --- /dev/null
>>> +++ b/drivers/block/rnull.rs
>>> @@ -0,0 +1,86 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +
>>> +//! This is a Rust implementation of the C null block driver.
>>> +//!
>>> +//! Supported features:
>>> +//!
>>> +//! - blk-mq interface
>>> +//! - direct completion
>>> +//! - block size 4k
>>> +//!
>>> +//! The driver is not configurable.
>>> +
>>> +use kernel::{
>>> + alloc::flags,
>>> + block::mq::{
>>> + self,
>>> + gen_disk::{self, GenDisk},
>>> + Operations, TagSet,
>>> + },
>>> + error::Result,
>>> + new_mutex, pr_info,
>>> + prelude::*,
>>> + sync::{Arc, Mutex},
>>> + types::ARef,
>>> +};
>>> +
>>> +module! {
>>> + type: NullBlkModule,
>>> + name: "rnull_mod",
>>> + author: "Andreas Hindborg",
>>> + license: "GPL v2",
>>> +}
>>> +
>>> +struct NullBlkModule {
>>> + _disk: Pin<Box<Mutex<GenDisk<NullBlkDevice, gen_disk::Added>>>>,
>>> +}
>>> +
>>> +fn add_disk(tagset: Arc<TagSet<NullBlkDevice>>) -> Result<GenDisk<NullBlkDevice, gen_disk::Added>> {
>>
>> Any reason that this is its own function and not in the
>> `NullBlkModule::init` function?
>
> Would you embed it inside the `init` function? To what end? I don't
> think that would read well.
I just found it strange that you have this extracted into its own
function, since I just expected it to be present in the init function.
Does this look really that bad?:
impl kernel::Module for NullBlkModule {
fn init(_module: &'static ThisModule) -> Result<Self> {
pr_info!("Rust null_blk loaded\n");
let block_size: u16 = 4096;
if block_size % 512 != 0 ||
!(512..=4096).contains(&block_size) {
return Err(kernel::error::code::EINVAL);
}
let disk = {
let tagset = Arc::pin_init(TagSet::try_new(1, 256, 1),
flags::GFP_KERNEL)?;
let mut disk = gen_disk::try_new(tagset)?;
disk.set_name(format_args!("rnullb{}", 0))?;
disk.set_capacity_sectors(4096 << 11);
disk.set_queue_logical_block_size(block_size.into());
disk.set_queue_physical_block_size(block_size.into());
disk.set_rotational(false);
disk.add()
};
let disk = Box::pin_init(
new_mutex!(disk, "nullb:disk"),
flags::GFP_KERNEL,
)?;
Ok(Self { _disk: disk })
}
}
>>> + let block_size: u16 = 4096;
>>> + if block_size % 512 != 0 || !(512..=4096).contains(&block_size) {
>>> + return Err(kernel::error::code::EINVAL);
>>> + }
>>> +
>>> + let mut disk = gen_disk::try_new(tagset)?;
>>> + disk.set_name(format_args!("rnullb{}", 0))?;
>>> + disk.set_capacity_sectors(4096 << 11);
>>> + disk.set_queue_logical_block_size(block_size.into());
>>> + disk.set_queue_physical_block_size(block_size.into());
>>> + disk.set_rotational(false);
>>> + disk.add()
>>> +}
>>> +
>>> +impl kernel::Module for NullBlkModule {
>>> + fn init(_module: &'static ThisModule) -> Result<Self> {
>>> + pr_info!("Rust null_blk loaded\n");
>>> + let tagset = Arc::pin_init(TagSet::try_new(1, 256, 1), flags::GFP_KERNEL)?;
>>> + let disk = Box::pin_init(
>>> + new_mutex!(add_disk(tagset)?, "nullb:disk"),
>>> + flags::GFP_KERNEL,
>>> + )?;
>>> +
>>> + Ok(Self { _disk: disk })
>>> + }
>>> +}
>>> +
>>> +struct NullBlkDevice;
>>> +
>>> +#[vtable]
>>> +impl Operations for NullBlkDevice {
>>> + #[inline(always)]
>>> + fn queue_rq(rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result {
>>> + mq::Request::end_ok(rq)
>>> + .map_err(|_e| kernel::error::code::EIO)
>>> + .expect("Failed to complete request");
>>
>> This error would only happen if `rq` is not the only ARef to that
>> request, right?
>
> Yes, it should never happen. If it happens, something is seriously
> broken and panic is adequate.
>
> Other drivers might want to retry later or something, but in this case
> it should just work.
In that case, I think the error message should reflect that and not just
be a generic "failed to complete request" error.
>>> +
>>> + Ok(())
>>> + }
>>> +
>>> + fn commit_rqs() {}
>>> +
>>> + fn complete(rq: ARef<mq::Request<Self>>) {
>>
>> Am I correct in thinking that this function is never actually called,
>> since all requests that are queued are immediately ended?
>
> Yes, re my other email. It is a callback that cannot be triggered for
> now. I will move it to a later patch where it belongs.
>
>>
>>> + mq::Request::end_ok(rq)
>>> + .map_err(|_e| kernel::error::code::EIO)
>>> + .expect("Failed to complete request")
>>> + }
>>> +}
>>> diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs
>>> index efbd2588791b..54e032bbdffd 100644
>>> --- a/rust/kernel/block/mq.rs
>>> +++ b/rust/kernel/block/mq.rs
>>> @@ -51,6 +51,7 @@
>>> //!
>>> //! ```rust
>>> //! use kernel::{
>>> +//! alloc::flags,
>>> //! block::mq::*,
>>> //! new_mutex,
>>> //! prelude::*,
>>> @@ -77,7 +78,8 @@
>>> //! }
>>> //! }
>>> //!
>>> -//! let tagset: Arc<TagSet<MyBlkDevice>> = Arc::pin_init(TagSet::try_new(1, 256, 1))?;
>>> +//! let tagset: Arc<TagSet<MyBlkDevice>> =
>>> +//! Arc::pin_init(TagSet::try_new(1, 256, 1), flags::GFP_KERNEL)?;
>>
>> This change should probably be in the patch before (seems like an
>> artifact from rebasing).
>
> Yes, thank you for spotting that. I thought I checked that this was
> building, so this is strange to me. Maybe I failed to build the
> doctests between the two patches. It is weird that kernel robot did not
> pick this up. Maybe it is not building doctests?
Hmm, that is strange...
---
Cheers,
Benno
Benno Lossin <[email protected]> writes:
[...]
>>>> +
>>>> +fn add_disk(tagset: Arc<TagSet<NullBlkDevice>>) -> Result<GenDisk<NullBlkDevice, gen_disk::Added>> {
>>>
>>> Any reason that this is its own function and not in the
>>> `NullBlkModule::init` function?
>>
>> Would you embed it inside the `init` function? To what end? I don't
>> think that would read well.
>
> I just found it strange that you have this extracted into its own
> function, since I just expected it to be present in the init function.
> Does this look really that bad?:
>
> impl kernel::Module for NullBlkModule {
> fn init(_module: &'static ThisModule) -> Result<Self> {
> pr_info!("Rust null_blk loaded\n");
> let block_size: u16 = 4096;
> if block_size % 512 != 0 ||
> !(512..=4096).contains(&block_size) {
> return Err(kernel::error::code::EINVAL);
> }
>
> let disk = {
> let tagset = Arc::pin_init(TagSet::try_new(1, 256, 1),
> flags::GFP_KERNEL)?;
> let mut disk = gen_disk::try_new(tagset)?;
> disk.set_name(format_args!("rnullb{}", 0))?;
> disk.set_capacity_sectors(4096 << 11);
> disk.set_queue_logical_block_size(block_size.into());
> disk.set_queue_physical_block_size(block_size.into());
> disk.set_rotational(false);
> disk.add()
> };
> let disk = Box::pin_init(
> new_mutex!(disk, "nullb:disk"),
> flags::GFP_KERNEL,
> )?;
>
> Ok(Self { _disk: disk })
> }
> }
I don't mind either way. I guess we could combine it.
[...]
>>>> +#[vtable]
>>>> +impl Operations for NullBlkDevice {
>>>> + #[inline(always)]
>>>> + fn queue_rq(rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result {
>>>> + mq::Request::end_ok(rq)
>>>> + .map_err(|_e| kernel::error::code::EIO)
>>>> + .expect("Failed to complete request");
>>>
>>> This error would only happen if `rq` is not the only ARef to that
>>> request, right?
>>
>> Yes, it should never happen. If it happens, something is seriously
>> broken and panic is adequate.
>>
>> Other drivers might want to retry later or something, but in this case
>> it should just work.
>
> In that case, I think the error message should reflect that and not just
> be a generic "failed to complete request" error.
Right, that is a good point.
Best regards,
Andreas
Benno Lossin <[email protected]> writes:
[...]
>>>> + /// Notify the block layer that the request has been completed without errors.
>>>> + ///
>>>> + /// This function will return `Err` if `this` is not the only `ARef`
>>>> + /// referencing the request.
>>>> + pub fn end_ok(this: ARef<Self>) -> Result<(), ARef<Self>> {
>>>
>>> I am not yet fully convinced that this is the way we should go. I think
>>> I would have to see a more complex usage of `Request` with that id <->
>>> request mapping that you mentioned. Because with how rnull uses this
>>> API, it could also have a `URef<Self>` parameter (URef := unique ARef).
>>
>> I considered a `UniqueARef` but it would just move the error handing to
>> `ARef::into_unique` and then make `end_ok` infallible.
>>
>> There are four states for a request that we need to track:
>>
>> A) Request is owned by block layer (refcount 0)
>> B) Request is owned by driver but with zero `ARef`s in existence
>> (refcount 1)
>> C) Request is owned by driver with exactly one `ARef` in existence
>> (refcount 2)
>> D) Request is owned by driver with more than one `ARef` in existence
>> (refcount > 2)
>>
>> It is in the doc comments for `RequestDataWrapper` as well.
>>
>> We need A and B to ensure we fail tag to request conversions for
>> requests that are not owned by the driver.
>>
>> We need C and D to ensure that it is safe to end the request and hand back
>> ownership to the block layer.
>>
>> I will ping you when I hook up the NVMe driver with this.
>
> Thanks. I think that since the C side doesn't use ref-counting, the
> lifecycle of a request is probably rather simple. Therefore we should
> try to also avoid refcounting in Rust and see if we can eg tie ending
> requests to the associated `TagSet` (ie require `&mut` on the tagset)
> and tie accessing requests to shared access to the `TagSet`. Then we
> would be able to avoid the refcount. But I will first have to take a
> look at the nvme driver to gauge the plausibility.
C side _does_ use ref-counting in the `ref` field of the C `struct
request`. I am not able to reuse that field for the state tracking I
need. Other users such as iostat will take references on the request and
we will not be able to tell if there are no more Rust refs to the
request from that field. We need a separate one.
Anyways I think we should go with the current implementation for now. We
can always change it, nothing is locked in stone.
[...]
>>>> +/// Store the result of `op(target.load())` in target, returning new value of
>>>> +/// taret.
>>>> +fn atomic_relaxed_op_return(target: &AtomicU64, op: impl Fn(u64) -> u64) -> u64 {
>>>> + let mut old = target.load(Ordering::Relaxed);
>>>> + loop {
>>>> + match target.compare_exchange_weak(old, op(old), Ordering::Relaxed, Ordering::Relaxed) {
>>>> + Ok(_) => break,
>>>> + Err(x) => {
>>>> + old = x;
>>>> + }
>>>> + }
>>>> + }
>>>
>>> This seems like a reimplementation of `AtomicU64::fetch_update` to me.
>>
>> It looks like it! Except this function is returning the updated value,
>> `fetch_update` is returning the old value.
>>
>> Would you rather that I rewrite in terms of the library function?
>
> If you can just use the fetch_update function, then that would be better
> than (almost) reimplementing it. But if you really need to get the new
> value, then I guess it can't really be helped. (or do you think you can
> just apply `op` to the old value returned by `fetch_update`?)
I can implement `atomic_relaxed_op_return` in terms of `fetch_update` ????
[...]
>>>> + let place = place.cast::<bindings::blk_mq_tag_set>();
>>>> +
>>>> + // SAFETY: try_ffi_init promises that `place` is writable, and
>>>> + // zeroes is a valid bit pattern for this structure.
>>>> + core::ptr::write_bytes(place, 0, 1);
>>>> +
>>>> + /// For a raw pointer to a struct, write a struct field without
>>>> + /// creating a reference to the field
>>>> + macro_rules! write_ptr_field {
>>>> + ($target:ident, $field:ident, $value:expr) => {
>>>> + ::core::ptr::write(::core::ptr::addr_of_mut!((*$target).$field), $value)
>>>> + };
>>>> + }
>>>> +
>>>> + // SAFETY: try_ffi_init promises that `place` is writable
>>>> + write_ptr_field!(place, ops, OperationsVTable::<T>::build());
>>>> + write_ptr_field!(place, nr_hw_queues , nr_hw_queues);
>>>> + write_ptr_field!(place, timeout , 0); // 0 means default which is 30 * HZ in C
>>>> + write_ptr_field!(place, numa_node , bindings::NUMA_NO_NODE);
>>>> + write_ptr_field!(place, queue_depth , num_tags);
>>>> + write_ptr_field!(place, cmd_size , core::mem::size_of::<RequestDataWrapper>().try_into()?);
>>>> + write_ptr_field!(place, flags , bindings::BLK_MQ_F_SHOULD_MERGE);
>>>> + write_ptr_field!(place, driver_data , core::ptr::null_mut::<core::ffi::c_void>());
>>>> + write_ptr_field!(place, nr_maps , num_maps);
>>>
>>> I think that there is some way for pinned-init to do a better job here.
>>> I feel like we ought to be able to just write:
>>>
>>> Opaque::init(
>>> try_init!(bindings::blk_mq_tag_set {
>>> ops: OperationsVTable::<T>::build(),
>>> nr_hw_queues,
>>> timeout: 0, // 0 means default, which is 30Hz
>>> numa_node: bindings::NUMA_NO_NODE,
>>> queue_depth: num_tags,
>>> cmd_size: size_of::<RequestDataWrapper>().try_into()?,
>>> flags: bindings::BLK_MQ_F_SHOULD_MERGE,
>>> driver_data: null_mut(),
>>> nr_maps: num_maps,
>>> ..Zeroable::zeroed()
>>> }? Error)
>>> .chain(|tag_set| to_result(bindings::blk_mq_alloc_tag_set(tag_set)))
>>> )
>>>
>>> But we don't have `Opaque::init` (shouldn't be difficult) and
>>> `bindings::blk_mq_tag_set` doesn't implement `Zeroable`. We would need
>>> bindgen to put `derive(Zeroable)` on certain structs...
>>>
>>> Another option would be to just list the fields explicitly, since there
>>> aren't that many. What do you think?
>>
>> Both options sound good. Ofc the first one sounds more user friendly
>> while the latter one sounds easier to implement. Getting rid of the
>> unsafe blocks here would be really nice.
>
> I think since it is not that expensive in this case, you should go for
> the second approach.
> We can fix it later, when we get the proper bindgen support.
Cool, I will send a follow up patch with this ????
Best regards,
Andreas