From: Long Li <[email protected]>
Azure Blob storage provides scalable and durable data storage for Azure.
(https://azure.microsoft.com/en-us/services/storage/blobs/)
This driver adds support for accelerated access to Azure Blob storage. As an
alternative to REST APIs, it provides a fast data path that uses the host's
native network stack and a secure direct data link to the storage server.
Cc: Jonathan Corbet <[email protected]>
Cc: "K. Y. Srinivasan" <[email protected]>
Cc: Haiyang Zhang <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Cc: Wei Liu <[email protected]>
Cc: Dexuan Cui <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Cc: Bjorn Andersson <[email protected]>
Cc: Hans de Goede <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Maximilian Luz <[email protected]>
Cc: Mike Rapoport <[email protected]>
Cc: Ben Widawsky <[email protected]>
Cc: Jiri Slaby <[email protected]>
Cc: Andra Paraschiv <[email protected]>
Cc: Siddharth Gupta <[email protected]>
Cc: Hannes Reinecke <[email protected]>
Cc: [email protected]
Signed-off-by: Long Li <[email protected]>
---
Documentation/userspace-api/ioctl/ioctl-number.rst | 2 +
drivers/hv/Kconfig | 10 +
drivers/hv/Makefile | 1 +
drivers/hv/azure_blob.c | 655 +++++++++++++++++++++
drivers/hv/channel_mgmt.c | 7 +
include/linux/hyperv.h | 9 +
include/uapi/misc/azure_blob.h | 31 +
7 files changed, 715 insertions(+)
create mode 100644 drivers/hv/azure_blob.c
create mode 100644 include/uapi/misc/azure_blob.h
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 9bfc2b5..d3c2a90 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -180,6 +180,8 @@ Code Seq# Include File Comments
'R' 01 linux/rfkill.h conflict!
'R' C0-DF net/bluetooth/rfcomm.h
'R' E0 uapi/linux/fsl_mc.h
+'R' F0-FF uapi/misc/azure_blob.h Microsoft Azure Blob driver
+ <mailto:[email protected]>
'S' all linux/cdrom.h conflict!
'S' 80-81 scsi/scsi_ioctl.h conflict!
'S' 82-FF scsi/scsi.h conflict!
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 66c794d..e08b8d3 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -27,4 +27,14 @@ config HYPERV_BALLOON
help
Select this option to enable Hyper-V Balloon driver.
+config HYPERV_AZURE_BLOB
+ tristate "Microsoft Azure Blob driver"
+ depends on HYPERV && X86_64
+ help
+ Select this option to enable Microsoft Azure Blob driver.
+
+ This driver supports accelerated Microsoft Azure Blob access.
+ To compile this driver as a module, choose M here. The module will be
+ called azure_blob.
+
endmenu
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index 94daf82..a322575 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -2,6 +2,7 @@
obj-$(CONFIG_HYPERV) += hv_vmbus.o
obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
+obj-$(CONFIG_HYPERV_AZURE_BLOB) += azure_blob.o
CFLAGS_hv_trace.o = -I$(src)
CFLAGS_hv_balloon.o = -I$(src)
diff --git a/drivers/hv/azure_blob.c b/drivers/hv/azure_blob.c
new file mode 100644
index 0000000..6c8f957
--- /dev/null
+++ b/drivers/hv/azure_blob.c
@@ -0,0 +1,655 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/* Copyright (c) 2021, Microsoft Corporation. */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/debugfs.h>
+#include <linux/pagemap.h>
+#include <linux/hyperv.h>
+#include <linux/miscdevice.h>
+#include <linux/uio.h>
+#include <uapi/misc/azure_blob.h>
+
+struct az_blob_device {
+ struct hv_device *device;
+
+ /* Opened files maintained by this device */
+ struct list_head file_list;
+ spinlock_t file_lock;
+ wait_queue_head_t file_wait;
+
+ bool removing;
+};
+
+/* VSP messages */
+enum az_blob_vsp_request_type {
+ AZ_BLOB_DRIVER_REQUEST_FIRST = 0x100,
+ AZ_BLOB_DRIVER_USER_REQUEST = 0x100,
+ AZ_BLOB_DRIVER_REGISTER_BUFFER = 0x101,
+ AZ_BLOB_DRIVER_DEREGISTER_BUFFER = 0x102,
+};
+
+/* VSC->VSP request */
+struct az_blob_vsp_request {
+ u32 version;
+ u32 timeout_ms;
+ u32 data_buffer_offset;
+ u32 data_buffer_length;
+ u32 data_buffer_valid;
+ u32 operation_type;
+ u32 request_buffer_offset;
+ u32 request_buffer_length;
+ u32 response_buffer_offset;
+ u32 response_buffer_length;
+ guid_t transaction_id;
+} __packed;
+
+/* VSP->VSC response */
+struct az_blob_vsp_response {
+ u32 length;
+ u32 error;
+ u32 response_len;
+} __packed;
+
+struct az_blob_vsp_request_ctx {
+ struct list_head list;
+ struct completion wait_vsp;
+ struct az_blob_request_sync *request;
+};
+
+struct az_blob_file_ctx {
+ struct list_head list;
+
+ /* List of pending requests to VSP */
+ struct list_head vsp_pending_requests;
+ spinlock_t vsp_pending_lock;
+ wait_queue_head_t wait_vsp_pending;
+};
+
+/* The maximum number of pages we can pass to VSP in a single packet */
+#define AZ_BLOB_MAX_PAGES 8192
+
+#ifdef CONFIG_DEBUG_FS
+struct dentry *az_blob_debugfs_root;
+#endif
+
+static struct az_blob_device az_blob_dev;
+
+static int az_blob_ringbuffer_size = (128 * 1024);
+module_param(az_blob_ringbuffer_size, int, 0444);
+MODULE_PARM_DESC(az_blob_ringbuffer_size, "Ring buffer size (bytes)");
+
+static const struct hv_vmbus_device_id id_table[] = {
+ { HV_AZURE_BLOB_GUID,
+ .driver_data = 0
+ },
+ { },
+};
+
+#define AZ_ERR 0
+#define AZ_WARN 1
+#define AZ_DBG 2
+static int log_level = AZ_ERR;
+module_param(log_level, int, 0644);
+MODULE_PARM_DESC(log_level,
+ "Log level: 0 - Error (default), 1 - Warning, 2 - Debug.");
+
+static uint device_queue_depth = 1024;
+module_param(device_queue_depth, uint, 0444);
+MODULE_PARM_DESC(device_queue_depth,
+ "System level max queue depth for this device");
+
+#define az_blob_log(level, fmt, args...) \
+do { \
+ if (level <= log_level) \
+ pr_err("%s:%d " fmt, __func__, __LINE__, ##args); \
+} while (0)
+
+#define az_blob_dbg(fmt, args...) az_blob_log(AZ_DBG, fmt, ##args)
+#define az_blob_warn(fmt, args...) az_blob_log(AZ_WARN, fmt, ##args)
+#define az_blob_err(fmt, args...) az_blob_log(AZ_ERR, fmt, ##args)
+
+static void az_blob_on_channel_callback(void *context)
+{
+ struct vmbus_channel *channel = (struct vmbus_channel *)context;
+ const struct vmpacket_descriptor *desc;
+
+ az_blob_dbg("entering interrupt from vmbus\n");
+ foreach_vmbus_pkt(desc, channel) {
+ struct az_blob_vsp_request_ctx *request_ctx;
+ struct az_blob_vsp_response *response;
+ u64 cmd_rqst = vmbus_request_addr(&channel->requestor,
+ desc->trans_id);
+ if (cmd_rqst == VMBUS_RQST_ERROR) {
+ az_blob_err("incorrect transaction id %llu\n",
+ desc->trans_id);
+ continue;
+ }
+ request_ctx = (struct az_blob_vsp_request_ctx *) cmd_rqst;
+ response = hv_pkt_data(desc);
+
+ az_blob_dbg("got response for request %pUb status %u "
+ "response_len %u\n",
+ &request_ctx->request->guid, response->error,
+ response->response_len);
+ request_ctx->request->response.status = response->error;
+ request_ctx->request->response.response_len =
+ response->response_len;
+ complete(&request_ctx->wait_vsp);
+ }
+
+}
+
+static int az_blob_fop_open(struct inode *inode, struct file *file)
+{
+ struct az_blob_file_ctx *file_ctx;
+ unsigned long flags;
+
+
+ file_ctx = kzalloc(sizeof(*file_ctx), GFP_KERNEL);
+ if (!file_ctx)
+ return -ENOMEM;
+
+
+ spin_lock_irqsave(&az_blob_dev.file_lock, flags);
+ if (az_blob_dev.removing) {
+ spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
+ kfree(file_ctx);
+ return -ENODEV;
+ }
+ list_add_tail(&file_ctx->list, &az_blob_dev.file_list);
+ spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
+
+ INIT_LIST_HEAD(&file_ctx->vsp_pending_requests);
+ init_waitqueue_head(&file_ctx->wait_vsp_pending);
+ file->private_data = file_ctx;
+
+ return 0;
+}
+
+static int az_blob_fop_release(struct inode *inode, struct file *file)
+{
+ struct az_blob_file_ctx *file_ctx = file->private_data;
+ unsigned long flags;
+
+ wait_event(file_ctx->wait_vsp_pending,
+ list_empty(&file_ctx->vsp_pending_requests));
+
+ spin_lock_irqsave(&az_blob_dev.file_lock, flags);
+ list_del(&file_ctx->list);
+ if (list_empty(&az_blob_dev.file_list))
+ wake_up(&az_blob_dev.file_wait);
+ spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
+
+ return 0;
+}
+
+static inline bool az_blob_safe_file_access(struct file *file)
+{
+ return file->f_cred == current_cred() && !uaccess_kernel();
+}
+
+static int get_buffer_pages(int rw, void __user *buffer, u32 buffer_len,
+ struct page ***ppages, size_t *start, size_t *num_pages)
+{
+ struct iovec iov;
+ struct iov_iter iter;
+ int ret;
+ ssize_t result;
+ struct page **pages;
+
+ ret = import_single_range(rw, buffer, buffer_len, &iov, &iter);
+ if (ret) {
+ az_blob_dbg("request buffer access error %d\n", ret);
+ return ret;
+ }
+ az_blob_dbg("iov_iter type %d offset %lu count %lu nr_segs %lu\n",
+ iter.type, iter.iov_offset, iter.count, iter.nr_segs);
+
+ result = iov_iter_get_pages_alloc(&iter, &pages, buffer_len, start);
+ if (result < 0) {
+ az_blob_dbg("failed to pin user pages result=%ld\n", result);
+ return result;
+ }
+ if (result != buffer_len) {
+ az_blob_dbg("can't pin user pages requested %d got %ld\n",
+ buffer_len, result);
+ return -EFAULT;
+ }
+
+ *ppages = pages;
+ *num_pages = (result + *start + PAGE_SIZE - 1) / PAGE_SIZE;
+ return 0;
+}
+
+static void fill_in_page_buffer(u64 *pfn_array,
+ int *index, struct page **pages, unsigned long num_pages)
+{
+ int i, page_idx = *index;
+
+ for (i = 0; i < num_pages; i++)
+ pfn_array[page_idx++] = page_to_pfn(pages[i]);
+ *index = page_idx;
+}
+
+static void free_buffer_pages(size_t num_pages, struct page **pages)
+{
+ unsigned long i;
+
+ for (i = 0; i < num_pages; i++)
+ if (pages[i])
+ put_page(pages[i]);
+ kvfree(pages);
+}
+
+static long az_blob_ioctl_user_request(struct file *filp, unsigned long arg)
+{
+ struct az_blob_device *dev = &az_blob_dev;
+ struct az_blob_file_ctx *file_ctx = filp->private_data;
+ char __user *argp = (char __user *) arg;
+ struct az_blob_request_sync request;
+ struct az_blob_vsp_request_ctx request_ctx;
+ unsigned long flags;
+ int ret;
+ size_t request_start, request_num_pages = 0;
+ size_t response_start, response_num_pages = 0;
+ size_t data_start, data_num_pages = 0, total_num_pages;
+ struct page **request_pages = NULL, **response_pages = NULL;
+ struct page **data_pages = NULL;
+ struct vmbus_packet_mpb_array *desc;
+ u64 *pfn_array;
+ int desc_size;
+ int page_idx;
+ struct az_blob_vsp_request *vsp_request;
+
+ /* Fast fail if device is being removed */
+ if (dev->removing)
+ return -ENODEV;
+
+ if (!az_blob_safe_file_access(filp)) {
+ az_blob_dbg("process %d(%s) changed security contexts after"
+ " opening file descriptor\n",
+ task_tgid_vnr(current), current->comm);
+ return -EACCES;
+ }
+
+ if (copy_from_user(&request, argp, sizeof(request))) {
+ az_blob_dbg("don't have permission to user provided buffer\n");
+ return -EFAULT;
+ }
+
+ az_blob_dbg("az_blob ioctl request guid %pUb timeout %u request_len %u"
+ " response_len %u data_len %u request_buffer %llx "
+ "response_buffer %llx data_buffer %llx\n",
+ &request.guid, request.timeout, request.request_len,
+ request.response_len, request.data_len, request.request_buffer,
+ request.response_buffer, request.data_buffer);
+
+ if (!request.request_len || !request.response_len)
+ return -EINVAL;
+
+ if (request.data_len && request.data_len < request.data_valid)
+ return -EINVAL;
+
+ init_completion(&request_ctx.wait_vsp);
+ request_ctx.request = &request;
+
+ /*
+ * Need to set rw to READ to have page table set up for passing to VSP.
+ * Setting it to WRITE would leave the page table entries unpopulated,
+ * waiting for Copy-On-Write at the next page fault. This doesn't
+ * work in this scenario because VSP wants all the pages to be present.
+ */
+ ret = get_buffer_pages(READ, (void __user *) request.request_buffer,
+ request.request_len, &request_pages, &request_start,
+ &request_num_pages);
+ if (ret)
+ goto get_user_page_failed;
+
+ ret = get_buffer_pages(READ, (void __user *) request.response_buffer,
+ request.response_len, &response_pages, &response_start,
+ &response_num_pages);
+ if (ret)
+ goto get_user_page_failed;
+
+ if (request.data_len) {
+ ret = get_buffer_pages(READ,
+ (void __user *) request.data_buffer, request.data_len,
+ &data_pages, &data_start, &data_num_pages);
+ if (ret)
+ goto get_user_page_failed;
+ }
+
+ total_num_pages = request_num_pages + response_num_pages +
+ data_num_pages;
+ if (total_num_pages > AZ_BLOB_MAX_PAGES) {
+ az_blob_dbg("number of DMA pages %lu buffer exceeding %u\n",
+ total_num_pages, AZ_BLOB_MAX_PAGES);
+ ret = -EINVAL;
+ goto get_user_page_failed;
+ }
+
+ /* Construct a VMBUS packet and send it over to VSP */
+ desc_size = sizeof(struct vmbus_packet_mpb_array) +
+ sizeof(u64) * total_num_pages;
+ desc = kzalloc(desc_size, GFP_KERNEL);
+ vsp_request = kzalloc(sizeof(*vsp_request), GFP_KERNEL);
+ if (!desc || !vsp_request) {
+ kfree(desc);
+ kfree(vsp_request);
+ ret = -ENOMEM;
+ goto get_user_page_failed;
+ }
+
+ desc->range.offset = 0;
+ desc->range.len = total_num_pages * PAGE_SIZE;
+ pfn_array = desc->range.pfn_array;
+ page_idx = 0;
+
+ if (request.data_len) {
+ fill_in_page_buffer(pfn_array, &page_idx, data_pages,
+ data_num_pages);
+ vsp_request->data_buffer_offset = data_start;
+ vsp_request->data_buffer_length = request.data_len;
+ vsp_request->data_buffer_valid = request.data_valid;
+ }
+
+ fill_in_page_buffer(pfn_array, &page_idx, request_pages,
+ request_num_pages);
+ vsp_request->request_buffer_offset = request_start +
+ data_num_pages * PAGE_SIZE;
+ vsp_request->request_buffer_length = request.request_len;
+
+ fill_in_page_buffer(pfn_array, &page_idx, response_pages,
+ response_num_pages);
+ vsp_request->response_buffer_offset = response_start +
+ (data_num_pages + request_num_pages) * PAGE_SIZE;
+ vsp_request->response_buffer_length = request.response_len;
+
+ vsp_request->version = 0;
+ vsp_request->timeout_ms = request.timeout;
+ vsp_request->operation_type = AZ_BLOB_DRIVER_USER_REQUEST;
+ guid_copy(&vsp_request->transaction_id, &request.guid);
+
+ spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
+ list_add_tail(&request_ctx.list, &file_ctx->vsp_pending_requests);
+ spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
+
+ az_blob_dbg("sending request to VSP\n");
+ az_blob_dbg("desc_size %u desc->range.len %u desc->range.offset %u\n",
+ desc_size, desc->range.len, desc->range.offset);
+ az_blob_dbg("vsp_request data_buffer_offset %u data_buffer_length %u "
+ "data_buffer_valid %u request_buffer_offset %u "
+ "request_buffer_length %u response_buffer_offset %u "
+ "response_buffer_length %u\n",
+ vsp_request->data_buffer_offset,
+ vsp_request->data_buffer_length,
+ vsp_request->data_buffer_valid,
+ vsp_request->request_buffer_offset,
+ vsp_request->request_buffer_length,
+ vsp_request->response_buffer_offset,
+ vsp_request->response_buffer_length);
+
+ ret = vmbus_sendpacket_mpb_desc(dev->device->channel, desc, desc_size,
+ vsp_request, sizeof(*vsp_request), (u64) &request_ctx);
+
+ kfree(desc);
+ kfree(vsp_request);
+ if (ret)
+ goto vmbus_send_failed;
+
+ wait_for_completion(&request_ctx.wait_vsp);
+
+ /*
+ * At this point, the response is already written to request
+ * by VMBUS completion handler, copy them to user-mode buffers
+ * and return to user-mode
+ */
+ if (copy_to_user(argp +
+ offsetof(struct az_blob_request_sync,
+ response.status),
+ &request.response.status,
+ sizeof(request.response.status))) {
+ ret = -EFAULT;
+ goto vmbus_send_failed;
+ }
+
+ if (copy_to_user(argp +
+ offsetof(struct az_blob_request_sync,
+ response.response_len),
+ &request.response.response_len,
+ sizeof(request.response.response_len)))
+ ret = -EFAULT;
+
+vmbus_send_failed:
+ spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
+ list_del(&request_ctx.list);
+ if (list_empty(&file_ctx->vsp_pending_requests))
+ wake_up(&file_ctx->wait_vsp_pending);
+ spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
+
+get_user_page_failed:
+ free_buffer_pages(request_num_pages, request_pages);
+ free_buffer_pages(response_num_pages, response_pages);
+ free_buffer_pages(data_num_pages, data_pages);
+
+ return ret;
+}
+
+static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ long ret = -EIO;
+
+ switch (cmd) {
+ case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST:
+ if (_IOC_SIZE(cmd) != sizeof(struct az_blob_request_sync))
+ return -EINVAL;
+ ret = az_blob_ioctl_user_request(filp, arg);
+ break;
+
+ default:
+ az_blob_dbg("unrecognized IOCTL code %u\n", cmd);
+ }
+
+ return ret;
+}
+
+static const struct file_operations az_blob_client_fops = {
+ .owner = THIS_MODULE,
+ .open = az_blob_fop_open,
+ .unlocked_ioctl = az_blob_fop_ioctl,
+ .release = az_blob_fop_release,
+};
+
+static struct miscdevice az_blob_misc_device = {
+ MISC_DYNAMIC_MINOR,
+ "azure_blob",
+ &az_blob_client_fops,
+};
+
+static int az_blob_show_pending_requests(struct seq_file *m, void *v)
+{
+ unsigned long flags, flags2;
+ struct az_blob_vsp_request_ctx *request_ctx;
+ struct az_blob_file_ctx *file_ctx;
+
+ seq_puts(m, "List of pending requests\n");
+ seq_puts(m, "UUID request_len response_len data_len "
+ "request_buffer response_buffer data_buffer\n");
+ spin_lock_irqsave(&az_blob_dev.file_lock, flags);
+ list_for_each_entry(file_ctx, &az_blob_dev.file_list, list) {
+ spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags2);
+ list_for_each_entry(request_ctx,
+ &file_ctx->vsp_pending_requests, list) {
+ seq_printf(m, "%pUb ", &request_ctx->request->guid);
+ seq_printf(m, "%u ", request_ctx->request->request_len);
+ seq_printf(m,
+ "%u ", request_ctx->request->response_len);
+ seq_printf(m, "%u ", request_ctx->request->data_len);
+ seq_printf(m,
+ "%llx ", request_ctx->request->request_buffer);
+ seq_printf(m,
+ "%llx ", request_ctx->request->response_buffer);
+ seq_printf(m,
+ "%llx\n", request_ctx->request->data_buffer);
+ }
+ spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags2);
+ }
+ spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
+
+ return 0;
+}
+
+static int az_blob_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, az_blob_show_pending_requests, NULL);
+}
+
+static const struct file_operations az_blob_debugfs_fops = {
+ .open = az_blob_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static void az_blob_remove_device(struct az_blob_device *dev)
+{
+ wait_event(dev->file_wait, list_empty(&dev->file_list));
+ misc_deregister(&az_blob_misc_device);
+#ifdef CONFIG_DEBUG_FS
+ debugfs_remove_recursive(az_blob_debugfs_root);
+#endif
+ /* At this point, we won't get any requests from user-mode */
+}
+
+static int az_blob_create_device(struct az_blob_device *dev)
+{
+ int rc;
+ struct dentry *d;
+
+ rc = misc_register(&az_blob_misc_device);
+ if (rc) {
+ az_blob_err("misc_register failed rc %d\n", rc);
+ return rc;
+ }
+
+#ifdef CONFIG_DEBUG_FS
+ az_blob_debugfs_root = debugfs_create_dir("az_blob", NULL);
+ if (!IS_ERR_OR_NULL(az_blob_debugfs_root)) {
+ d = debugfs_create_file("pending_requests", 0400,
+ az_blob_debugfs_root, NULL,
+ &az_blob_debugfs_fops);
+ if (IS_ERR_OR_NULL(d)) {
+ az_blob_warn("failed to create debugfs file\n");
+ debugfs_remove_recursive(az_blob_debugfs_root);
+ az_blob_debugfs_root = NULL;
+ }
+ } else
+ az_blob_warn("failed to create debugfs root\n");
+#endif
+
+ return 0;
+}
+
+static int az_blob_connect_to_vsp(struct hv_device *device, u32 ring_size)
+{
+ int ret;
+
+ spin_lock_init(&az_blob_dev.file_lock);
+ INIT_LIST_HEAD(&az_blob_dev.file_list);
+ init_waitqueue_head(&az_blob_dev.file_wait);
+
+ az_blob_dev.removing = false;
+
+ az_blob_dev.device = device;
+ device->channel->rqstor_size = device_queue_depth;
+
+ ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
+ az_blob_on_channel_callback, device->channel);
+
+ if (ret) {
+ az_blob_err("failed to connect to VSP ret %d\n", ret);
+ return ret;
+ }
+
+ hv_set_drvdata(device, &az_blob_dev);
+
+ return ret;
+}
+
+static void az_blob_remove_vmbus(struct hv_device *device)
+{
+ /* At this point, no VSC/VSP traffic is possible over vmbus */
+ hv_set_drvdata(device, NULL);
+ vmbus_close(device->channel);
+}
+
+static int az_blob_probe(struct hv_device *device,
+ const struct hv_vmbus_device_id *dev_id)
+{
+ int rc;
+
+ az_blob_dbg("probing device\n");
+
+ rc = az_blob_connect_to_vsp(device, az_blob_ringbuffer_size);
+ if (rc) {
+ az_blob_err("error connecting to VSP rc %d\n", rc);
+ return rc;
+ }
+
+ // create user-mode client library facing device
+ rc = az_blob_create_device(&az_blob_dev);
+ if (rc) {
+ az_blob_remove_vmbus(device);
+ return rc;
+ }
+
+ az_blob_dbg("successfully probed device\n");
+ return 0;
+}
+
+static int az_blob_remove(struct hv_device *dev)
+{
+ struct az_blob_device *device = hv_get_drvdata(dev);
+ unsigned long flags;
+
+ spin_lock_irqsave(&device->file_lock, flags);
+ device->removing = true;
+ spin_unlock_irqrestore(&device->file_lock, flags);
+
+ az_blob_remove_device(device);
+ az_blob_remove_vmbus(dev);
+ return 0;
+}
+
+static struct hv_driver az_blob_drv = {
+ .name = KBUILD_MODNAME,
+ .id_table = id_table,
+ .probe = az_blob_probe,
+ .remove = az_blob_remove,
+ .driver = {
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+};
+
+static int __init az_blob_drv_init(void)
+{
+ int ret;
+
+ ret = vmbus_driver_register(&az_blob_drv);
+ return ret;
+}
+
+static void __exit az_blob_drv_exit(void)
+{
+ vmbus_driver_unregister(&az_blob_drv);
+}
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Microsoft Azure Blob driver");
+module_init(az_blob_drv_init);
+module_exit(az_blob_drv_exit);
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 0c75662..436fdeb 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -154,6 +154,13 @@
.allowed_in_isolated = false,
},
+ /* Azure Blob */
+ { .dev_type = HV_AZURE_BLOB,
+ HV_AZURE_BLOB_GUID,
+ .perf_device = false,
+ .allowed_in_isolated = false,
+ },
+
/* Unknown GUID */
{ .dev_type = HV_UNKNOWN,
.perf_device = false,
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index d1e59db..ac31362 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -772,6 +772,7 @@ enum vmbus_device_type {
HV_FCOPY,
HV_BACKUP,
HV_DM,
+ HV_AZURE_BLOB,
HV_UNKNOWN,
};
@@ -1350,6 +1351,14 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f)
/*
+ * Azure Blob GUID
+ * {0590b792-db64-45cc-81db-b8d70c577c9e}
+ */
+#define HV_AZURE_BLOB_GUID \
+ .guid = GUID_INIT(0x0590b792, 0xdb64, 0x45cc, 0x81, 0xdb, \
+ 0xb8, 0xd7, 0x0c, 0x57, 0x7c, 0x9e)
+
+/*
* Shutdown GUID
* {0e0b6031-5213-4934-818b-38d90ced39db}
*/
diff --git a/include/uapi/misc/azure_blob.h b/include/uapi/misc/azure_blob.h
new file mode 100644
index 0000000..29413fc
--- /dev/null
+++ b/include/uapi/misc/azure_blob.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/* Copyright (c) 2021, Microsoft Corporation. */
+
+#ifndef _AZ_BLOB_H
+#define _AZ_BLOB_H
+
+/* user-mode sync request sent through ioctl */
+struct az_blob_request_sync_response {
+ __u32 status;
+ __u32 response_len;
+};
+
+struct az_blob_request_sync {
+ guid_t guid;
+ __u32 timeout;
+ __u32 request_len;
+ __u32 response_len;
+ __u32 data_len;
+ __u32 data_valid;
+ __aligned_u64 request_buffer;
+ __aligned_u64 response_buffer;
+ __aligned_u64 data_buffer;
+ struct az_blob_request_sync_response response;
+};
+
+#define AZ_BLOB_MAGIC_NUMBER 'R'
+#define IOCTL_AZ_BLOB_DRIVER_USER_REQUEST \
+ _IOWR(AZ_BLOB_MAGIC_NUMBER, 0xf0, \
+ struct az_blob_request_sync)
+
+#endif /* define _AZ_BLOB_H */
--
1.8.3.1
Hi,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linux/master]
[also build test ERROR on linus/master v5.13-rc7 next-20210625]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/longli-linuxonhyperv-com/Introduce-a-driver-to-support-host-accelerated-access-to-Microsoft-Azure-Blob/20210626-143213
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 0c18f29aae7ce3dadd26d8ee3505d07cc982df75
config: sh-buildonly-randconfig-r006-20210625 (attached as .config)
compiler: sh4-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/76fc89da311e853f8ecc323ad52b367eced9c730
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review longli-linuxonhyperv-com/Introduce-a-driver-to-support-host-accelerated-access-to-Microsoft-Azure-Blob/20210626-143213
git checkout 76fc89da311e853f8ecc323ad52b367eced9c730
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=sh
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
>> error: include/uapi/misc/azure_blob.h: missing "WITH Linux-syscall-note" for SPDX-License-Identifier
--
>> error: include/uapi/misc/azure_blob.h: missing "WITH Linux-syscall-note" for SPDX-License-Identifier
make[2]: *** [scripts/Makefile.headersinst:63: usr/include/misc/azure_blob.h] Error 1
make[2]: Target '__headers' not remade because of errors.
make[1]: *** [Makefile:1341: headers] Error 2
make[1]: Target 'headers_install' not remade because of errors.
make: *** [Makefile:215: __sub-make] Error 2
make: Target 'headers_install' not remade because of errors.
--
>> error: include/uapi/misc/azure_blob.h: missing "WITH Linux-syscall-note" for SPDX-License-Identifier
make[2]: *** [scripts/Makefile.headersinst:63: usr/include/misc/azure_blob.h] Error 1
make[2]: Target '__headers' not remade because of errors.
make[1]: *** [Makefile:1341: headers] Error 2
sh4-linux-gcc: error: command line option '-m4-nofpu' is not supported by this configuration
sh4-linux-gcc: error: command line option '-m4-nofpu' is not supported by this configuration
make[2]: *** [scripts/Makefile.build:117: scripts/mod/devicetable-offsets.s] Error 1
sh4-linux-gcc: error: command line option '-m4-nofpu' is not supported by this configuration
sh4-linux-gcc: error: command line option '-m4-nofpu' is not supported by this configuration
make[2]: *** [scripts/Makefile.build:272: scripts/mod/empty.o] Error 1
make[2]: Target '__build' not remade because of errors.
make[1]: *** [Makefile:1234: prepare0] Error 2
make[1]: Target 'prepare' not remade because of errors.
make: *** [Makefile:215: __sub-make] Error 2
make: Target 'prepare' not remade because of errors.
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
On 26.06.21 08:30, [email protected] wrote:
Hi,
> From: Long Li <[email protected]>
>
> Azure Blob storage provides scalable and durable data storage for Azure.
> (https://azure.microsoft.com/en-us/services/storage/blobs/)
It's a bit hard to quickly find out the exact semantics of that (w/o
fully reading the library code ...). :(
> This driver adds support for accelerated access to Azure Blob storage. As an
> alternative to REST APIs, it provides a fast data path that uses the host's
> native network stack and a secure direct data link to the storage server.
I don't think that yet another misc device with some very specific API
is a good thing. We should have a more generic userland interface.
Maybe a file system ?
OTOH, I've been considering adding other kind of blob stores (eg. venti
and similar things) and inventing an own subsys for that, so we'd have
some more universal interface. Unfortunately always been busy with other
things.
Maybe we should go that route instead of introducing yet another device
specific uapi.
--mtx
--
---
Note: unencrypted e-mails can easily be intercepted and manipulated!
For confidential communication, please send your GPG/PGP key.
---
Enrico Weigelt, metux IT consult
Free software and Linux embedded engineering
[email protected] -- +49-151-27565287
On Mon, Jun 28, 2021 at 07:12:40PM +0200, Enrico Weigelt, metux IT consult wrote:
> On 26.06.21 08:30, [email protected] wrote:
>
> Hi,
>
> > From: Long Li <[email protected]>
> >
> > Azure Blob storage provides scalable and durable data storage for Azure.
> > (https://azure.microsoft.com/en-us/services/storage/blobs/)
>
> It's a bit hard to quickly find out the exact semantics of that (w/o fully
> reading the library code ...). :(
>
> > This driver adds support for accelerated access to Azure Blob storage. As an
> > alternative to REST APIs, it provides a fast data path that uses the host's
> > native network stack and a secure direct data link to the storage server.
>
> I don't think that yet another misc device with some very specific API
> is a good thing. We should have a more generic userland interface.
>
> Maybe a file system ?
This can already be done if I'm not mistaken:
https://docs.microsoft.com/en-us/azure/storage/blobs/storage-how-to-mount-container-linux
https://docs.microsoft.com/en-us/azure/storage/blobs/network-file-system-protocol-support-how-to
>
> OTOH, I've been considering adding other kind of blob stores (eg. venti
> and similar things) and inventing an own subsys for that, so we'd have
> some more universal interface. Unfortunately always been busy with other
> things.
>
> Maybe we should go that route instead of introducing yet another device
> specific uapi.
This work of course does not preclude anyone from going that route.
Wei.
>
>
> --mtx
>
> --
> ---
> Note: unencrypted e-mails can easily be intercepted and manipulated!
> For confidential communication, please send your GPG/PGP key.
> ---
> Enrico Weigelt, metux IT consult
> Free software and Linux embedded engineering
> [email protected] -- +49-151-27565287
From: [email protected] <[email protected]> Sent: Friday, June 25, 2021 11:30 PM
>
> Azure Blob storage provides scalable and durable data storage for Azure.
> (https://azure.microsoft.com/en-us/services/storage/blobs/)
>
> This driver adds support for accelerated access to Azure Blob storage. As an
> alternative to REST APIs, it provides a fast data path that uses the host's
> native network stack and a secure direct data link to the storage server.
>
> Cc: Jonathan Corbet <[email protected]>
> Cc: "K. Y. Srinivasan" <[email protected]>
> Cc: Haiyang Zhang <[email protected]>
> Cc: Stephen Hemminger <[email protected]>
> Cc: Wei Liu <[email protected]>
> Cc: Dexuan Cui <[email protected]>
> Cc: Greg Kroah-Hartman <[email protected]>
> Cc: Bjorn Andersson <[email protected]>
> Cc: Hans de Goede <[email protected]>
> Cc: Dan Williams <[email protected]>
> Cc: Maximilian Luz <[email protected]>
> Cc: Mike Rapoport <[email protected]>
> Cc: Ben Widawsky <[email protected]>
> Cc: Jiri Slaby <[email protected]>
> Cc: Andra Paraschiv <[email protected]>
> Cc: Siddharth Gupta <[email protected]>
> Cc: Hannes Reinecke <[email protected]>
> Cc: [email protected]
> Signed-off-by: Long Li <[email protected]>
> ---
> Documentation/userspace-api/ioctl/ioctl-number.rst | 2 +
> drivers/hv/Kconfig | 10 +
> drivers/hv/Makefile | 1 +
> drivers/hv/azure_blob.c | 655 +++++++++++++++++++++
> drivers/hv/channel_mgmt.c | 7 +
> include/linux/hyperv.h | 9 +
> include/uapi/misc/azure_blob.h | 31 +
> 7 files changed, 715 insertions(+)
> create mode 100644 drivers/hv/azure_blob.c
> create mode 100644 include/uapi/misc/azure_blob.h
>
> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> index 9bfc2b5..d3c2a90 100644
> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> @@ -180,6 +180,8 @@ Code Seq# Include File Comments
> 'R' 01 linux/rfkill.h conflict!
> 'R' C0-DF net/bluetooth/rfcomm.h
> 'R' E0 uapi/linux/fsl_mc.h
> +'R' F0-FF uapi/misc/azure_blob.h Microsoft Azure Blob driver
> + <mailto:[email protected]>
> 'S' all linux/cdrom.h conflict!
> 'S' 80-81 scsi/scsi_ioctl.h conflict!
> 'S' 82-FF scsi/scsi.h conflict!
> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> index 66c794d..e08b8d3 100644
> --- a/drivers/hv/Kconfig
> +++ b/drivers/hv/Kconfig
> @@ -27,4 +27,14 @@ config HYPERV_BALLOON
> help
> Select this option to enable Hyper-V Balloon driver.
>
> +config HYPERV_AZURE_BLOB
> + tristate "Microsoft Azure Blob driver"
> + depends on HYPERV && X86_64
> + help
> + Select this option to enable Microsoft Azure Blob driver.
> +
> + This driver supports accelerated Microsoft Azure Blob access.
> + To compile this driver as a module, choose M here. The module will be
> + called azure_blob.
> +
> endmenu
> diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
> index 94daf82..a322575 100644
> --- a/drivers/hv/Makefile
> +++ b/drivers/hv/Makefile
> @@ -2,6 +2,7 @@
> obj-$(CONFIG_HYPERV) += hv_vmbus.o
> obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
> obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
> +obj-$(CONFIG_HYPERV_AZURE_BLOB) += azure_blob.o
>
> CFLAGS_hv_trace.o = -I$(src)
> CFLAGS_hv_balloon.o = -I$(src)
> diff --git a/drivers/hv/azure_blob.c b/drivers/hv/azure_blob.c
> new file mode 100644
> index 0000000..6c8f957
> --- /dev/null
> +++ b/drivers/hv/azure_blob.c
> @@ -0,0 +1,655 @@
> +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
> +/* Copyright (c) 2021, Microsoft Corporation. */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <linux/cred.h>
> +#include <linux/debugfs.h>
> +#include <linux/pagemap.h>
> +#include <linux/hyperv.h>
> +#include <linux/miscdevice.h>
> +#include <linux/uio.h>
> +#include <uapi/misc/azure_blob.h>
> +
> +struct az_blob_device {
> + struct hv_device *device;
> +
> + /* Opened files maintained by this device */
> + struct list_head file_list;
> + spinlock_t file_lock;
> + wait_queue_head_t file_wait;
> +
> + bool removing;
> +};
> +
> +/* VSP messages */
> +enum az_blob_vsp_request_type {
> + AZ_BLOB_DRIVER_REQUEST_FIRST = 0x100,
> + AZ_BLOB_DRIVER_USER_REQUEST = 0x100,
> + AZ_BLOB_DRIVER_REGISTER_BUFFER = 0x101,
> + AZ_BLOB_DRIVER_DEREGISTER_BUFFER = 0x102,
> +};
> +
> +/* VSC->VSP request */
> +struct az_blob_vsp_request {
> + u32 version;
> + u32 timeout_ms;
> + u32 data_buffer_offset;
> + u32 data_buffer_length;
> + u32 data_buffer_valid;
> + u32 operation_type;
> + u32 request_buffer_offset;
> + u32 request_buffer_length;
> + u32 response_buffer_offset;
> + u32 response_buffer_length;
> + guid_t transaction_id;
> +} __packed;
> +
> +/* VSP->VSC response */
> +struct az_blob_vsp_response {
> + u32 length;
> + u32 error;
> + u32 response_len;
> +} __packed;
> +
> +struct az_blob_vsp_request_ctx {
> + struct list_head list;
> + struct completion wait_vsp;
> + struct az_blob_request_sync *request;
> +};
> +
> +struct az_blob_file_ctx {
> + struct list_head list;
> +
> + /* List of pending requests to VSP */
> + struct list_head vsp_pending_requests;
> + spinlock_t vsp_pending_lock;
> + wait_queue_head_t wait_vsp_pending;
> +};
> +
> +/* The maximum number of pages we can pass to VSP in a single packet */
> +#define AZ_BLOB_MAX_PAGES 8192
> +
> +#ifdef CONFIG_DEBUG_FS
> +struct dentry *az_blob_debugfs_root;
> +#endif
> +
> +static struct az_blob_device az_blob_dev;
> +
> +static int az_blob_ringbuffer_size = (128 * 1024);
> +module_param(az_blob_ringbuffer_size, int, 0444);
> +MODULE_PARM_DESC(az_blob_ringbuffer_size, "Ring buffer size (bytes)");
> +
> +static const struct hv_vmbus_device_id id_table[] = {
> + { HV_AZURE_BLOB_GUID,
> + .driver_data = 0
> + },
> + { },
> +};
> +
> +#define AZ_ERR 0
> +#define AZ_WARN 1
> +#define AZ_DBG 2
> +static int log_level = AZ_ERR;
> +module_param(log_level, int, 0644);
> +MODULE_PARM_DESC(log_level,
> + "Log level: 0 - Error (default), 1 - Warning, 2 - Debug.");
> +
> +static uint device_queue_depth = 1024;
> +module_param(device_queue_depth, uint, 0444);
> +MODULE_PARM_DESC(device_queue_depth,
> + "System level max queue depth for this device");
> +
> +#define az_blob_log(level, fmt, args...) \
> +do { \
> + if (level <= log_level) \
> + pr_err("%s:%d " fmt, __func__, __LINE__, ##args); \
> +} while (0)
> +
> +#define az_blob_dbg(fmt, args...) az_blob_log(AZ_DBG, fmt, ##args)
> +#define az_blob_warn(fmt, args...) az_blob_log(AZ_WARN, fmt, ##args)
> +#define az_blob_err(fmt, args...) az_blob_log(AZ_ERR, fmt, ##args)
> +
> +static void az_blob_on_channel_callback(void *context)
> +{
> + struct vmbus_channel *channel = (struct vmbus_channel *)context;
> + const struct vmpacket_descriptor *desc;
> +
> + az_blob_dbg("entering interrupt from vmbus\n");
> + foreach_vmbus_pkt(desc, channel) {
> + struct az_blob_vsp_request_ctx *request_ctx;
> + struct az_blob_vsp_response *response;
> + u64 cmd_rqst = vmbus_request_addr(&channel->requestor,
> + desc->trans_id);
> + if (cmd_rqst == VMBUS_RQST_ERROR) {
> + az_blob_err("incorrect transaction id %llu\n",
> + desc->trans_id);
> + continue;
> + }
> + request_ctx = (struct az_blob_vsp_request_ctx *) cmd_rqst;
> + response = hv_pkt_data(desc);
> +
> + az_blob_dbg("got response for request %pUb status %u "
> + "response_len %u\n",
> + &request_ctx->request->guid, response->error,
> + response->response_len);
> + request_ctx->request->response.status = response->error;
> + request_ctx->request->response.response_len =
> + response->response_len;
> + complete(&request_ctx->wait_vsp);
> + }
> +
> +}
> +
> +static int az_blob_fop_open(struct inode *inode, struct file *file)
> +{
> + struct az_blob_file_ctx *file_ctx;
> + unsigned long flags;
> +
> +
> + file_ctx = kzalloc(sizeof(*file_ctx), GFP_KERNEL);
> + if (!file_ctx)
> + return -ENOMEM;
> +
> +
> + spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> + if (az_blob_dev.removing) {
> + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> + kfree(file_ctx);
> + return -ENODEV;
> + }
> + list_add_tail(&file_ctx->list, &az_blob_dev.file_list);
> + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> +
> + INIT_LIST_HEAD(&file_ctx->vsp_pending_requests);
> + init_waitqueue_head(&file_ctx->wait_vsp_pending);
It feels a little weird to be initializing fields in the file_ctx
*after* it has been added to the file list. But I guess it's
OK since no requests can be issued and the file can't be
closed until after this open call completes.
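Moving the two init calls (and the private_data assignment) above the
list_add_tail() would avoid having to reason about that window at all,
e.g. (just a sketch):

	INIT_LIST_HEAD(&file_ctx->vsp_pending_requests);
	init_waitqueue_head(&file_ctx->wait_vsp_pending);
	file->private_data = file_ctx;

	spin_lock_irqsave(&az_blob_dev.file_lock, flags);
	...
	list_add_tail(&file_ctx->list, &az_blob_dev.file_list);
	spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);

But as noted above, it is not strictly a bug.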
> + file->private_data = file_ctx;
> +
> + return 0;
> +}
> +
> +static int az_blob_fop_release(struct inode *inode, struct file *file)
> +{
> + struct az_blob_file_ctx *file_ctx = file->private_data;
> + unsigned long flags;
> +
> + wait_event(file_ctx->wait_vsp_pending,
> + list_empty(&file_ctx->vsp_pending_requests));
> +
> + spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> + list_del(&file_ctx->list);
> + if (list_empty(&az_blob_dev.file_list))
> + wake_up(&az_blob_dev.file_wait);
> + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> +
> + return 0;
> +}
> +
> +static inline bool az_blob_safe_file_access(struct file *file)
> +{
> + return file->f_cred == current_cred() && !uaccess_kernel();
> +}
> +
> +static int get_buffer_pages(int rw, void __user *buffer, u32 buffer_len,
> + struct page ***ppages, size_t *start, size_t *num_pages)
> +{
> + struct iovec iov;
> + struct iov_iter iter;
> + int ret;
> + ssize_t result;
> + struct page **pages;
> +
> + ret = import_single_range(rw, buffer, buffer_len, &iov, &iter);
> + if (ret) {
> + az_blob_dbg("request buffer access error %d\n", ret);
> + return ret;
> + }
> + az_blob_dbg("iov_iter type %d offset %lu count %lu nr_segs %lu\n",
> + iter.type, iter.iov_offset, iter.count, iter.nr_segs);
> +
> + result = iov_iter_get_pages_alloc(&iter, &pages, buffer_len, start);
> + if (result < 0) {
> + az_blob_dbg("failed to pin user pages result=%ld\n", result);
> + return result;
> + }
> + if (result != buffer_len) {
> + az_blob_dbg("can't pin user pages requested %d got %ld\n",
> + buffer_len, result);
> + return -EFAULT;
> + }
> +
> + *ppages = pages;
> + *num_pages = (result + *start + PAGE_SIZE - 1) / PAGE_SIZE;
> + return 0;
> +}
> +
> +static void fill_in_page_buffer(u64 *pfn_array,
> + int *index, struct page **pages, unsigned long num_pages)
> +{
> + int i, page_idx = *index;
> +
> + for (i = 0; i < num_pages; i++)
> + pfn_array[page_idx++] = page_to_pfn(pages[i]);
> + *index = page_idx;
> +}
> +
> +static void free_buffer_pages(size_t num_pages, struct page **pages)
> +{
> + unsigned long i;
> +
> + for (i = 0; i < num_pages; i++)
> + if (pages[i])
> + put_page(pages[i]);
> + kvfree(pages);
> +}
> +
> +static long az_blob_ioctl_user_request(struct file *filp, unsigned long arg)
> +{
> + struct az_blob_device *dev = &az_blob_dev;
> + struct az_blob_file_ctx *file_ctx = filp->private_data;
> + char __user *argp = (char __user *) arg;
> + struct az_blob_request_sync request;
> + struct az_blob_vsp_request_ctx request_ctx;
> + unsigned long flags;
> + int ret;
> + size_t request_start, request_num_pages = 0;
> + size_t response_start, response_num_pages = 0;
> + size_t data_start, data_num_pages = 0, total_num_pages;
> + struct page **request_pages = NULL, **response_pages = NULL;
> + struct page **data_pages = NULL;
> + struct vmbus_packet_mpb_array *desc;
> + u64 *pfn_array;
> + int desc_size;
> + int page_idx;
> + struct az_blob_vsp_request *vsp_request;
> +
> + /* Fast fail if device is being removed */
> + if (dev->removing)
> + return -ENODEV;
> +
> + if (!az_blob_safe_file_access(filp)) {
> + az_blob_dbg("process %d(%s) changed security contexts after"
> + " opening file descriptor\n",
> + task_tgid_vnr(current), current->comm);
> + return -EACCES;
> + }
> +
> + if (copy_from_user(&request, argp, sizeof(request))) {
> + az_blob_dbg("don't have permission to user provided buffer\n");
> + return -EFAULT;
> + }
> +
> + az_blob_dbg("az_blob ioctl request guid %pUb timeout %u request_len %u"
> + " response_len %u data_len %u request_buffer %llx "
> + "response_buffer %llx data_buffer %llx\n",
> + &request.guid, request.timeout, request.request_len,
> + request.response_len, request.data_len, request.request_buffer,
> + request.response_buffer, request.data_buffer);
> +
> + if (!request.request_len || !request.response_len)
> + return -EINVAL;
> +
> + if (request.data_len && request.data_len < request.data_valid)
> + return -EINVAL;
> +
> + init_completion(&request_ctx.wait_vsp);
> + request_ctx.request = &request;
> +
> + /*
> + * Need to set rw to READ to have page table set up for passing to VSP.
> + * Setting it to WRITE would leave the page table entries unpopulated,
> + * waiting for Copy-On-Write at the next page fault. This doesn't
> + * work in this scenario because VSP wants all the pages to be present.
> + */
> + ret = get_buffer_pages(READ, (void __user *) request.request_buffer,
> + request.request_len, &request_pages, &request_start,
> + &request_num_pages);
> + if (ret)
> + goto get_user_page_failed;
> +
> + ret = get_buffer_pages(READ, (void __user *) request.response_buffer,
> + request.response_len, &response_pages, &response_start,
> + &response_num_pages);
> + if (ret)
> + goto get_user_page_failed;
> +
> + if (request.data_len) {
> + ret = get_buffer_pages(READ,
> + (void __user *) request.data_buffer, request.data_len,
> + &data_pages, &data_start, &data_num_pages);
> + if (ret)
> + goto get_user_page_failed;
> + }
It still seems to me that request.request_len, request.response_len, and
request.data_len should be validated *before* get_buffer_pages() is called.
While the total number of pages that pinned is validated below against
AZ_BLOB_MAX_PAGES, bad values in the *_len fields could pin up to
12 Gbytes of memory before the check against AZ_BLOB_MAX_PAGES
happens. The pre-validation would make sure that each *_len value
is less than AZ_BLOB_MAX_PAGES, so that the max amount of memory
that could get pinned is 96 Mbytes.
Also, are the *_len values required to be a multiple of PAGE_SIZE?
I'm assuming not, and that any number of bytes is acceptable. If there
is some requirement such as being a multiple of 512 or something like
that, it should also be part of the pre-validation.
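A minimal pre-check (untested, assuming no alignment requirement beyond
what the VSP itself enforces) could go right after the existing length
checks, something like:

	if (request.request_len > AZ_BLOB_MAX_PAGES * PAGE_SIZE ||
	    request.response_len > AZ_BLOB_MAX_PAGES * PAGE_SIZE ||
	    request.data_len > AZ_BLOB_MAX_PAGES * PAGE_SIZE)
		return -EINVAL;

That bounds the worst-case pinned memory before anything is pinned, and
the existing total_num_pages check still catches the combined case.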
> +
> + total_num_pages = request_num_pages + response_num_pages +
> + data_num_pages;
> + if (total_num_pages > AZ_BLOB_MAX_PAGES) {
> + az_blob_dbg("number of DMA pages %lu buffer exceeding %u\n",
> + total_num_pages, AZ_BLOB_MAX_PAGES);
> + ret = -EINVAL;
> + goto get_user_page_failed;
> + }
> +
> + /* Construct a VMBUS packet and send it over to VSP */
> + desc_size = sizeof(struct vmbus_packet_mpb_array) +
> + sizeof(u64) * total_num_pages;
> + desc = kzalloc(desc_size, GFP_KERNEL);
> + vsp_request = kzalloc(sizeof(*vsp_request), GFP_KERNEL);
> + if (!desc || !vsp_request) {
> + kfree(desc);
> + kfree(vsp_request);
> + ret = -ENOMEM;
> + goto get_user_page_failed;
> + }
> +
> + desc->range.offset = 0;
> + desc->range.len = total_num_pages * PAGE_SIZE;
The range.len value is always a multiple of PAGE_SIZE. Assuming that
the request.*_len values are not required to be multiples of PAGE_SIZE,
then the sum of the buffer_length fields in the vsp_request may not
add up to range.len. Presumably the VSP is OK with that ....
> + pfn_array = desc->range.pfn_array;
> + page_idx = 0;
> +
> + if (request.data_len) {
> + fill_in_page_buffer(pfn_array, &page_idx, data_pages,
> + data_num_pages);
> + vsp_request->data_buffer_offset = data_start;
> + vsp_request->data_buffer_length = request.data_len;
> + vsp_request->data_buffer_valid = request.data_valid;
> + }
> +
> + fill_in_page_buffer(pfn_array, &page_idx, request_pages,
> + request_num_pages);
> + vsp_request->request_buffer_offset = request_start +
> + data_num_pages * PAGE_SIZE;
> + vsp_request->request_buffer_length = request.request_len;
> +
> + fill_in_page_buffer(pfn_array, &page_idx, response_pages,
> + response_num_pages);
> + vsp_request->response_buffer_offset = response_start +
> + (data_num_pages + request_num_pages) * PAGE_SIZE;
> + vsp_request->response_buffer_length = request.response_len;
Interestingly, if any of the data or request or response are adjacent in user
memory, but the boundary is not on a page boundary, the same page
may appear in two successive pfn_array entries. Again, I'm assuming that's
OK with the VSP. It appears that the VSP just treats the three memory
areas as completely independent of each other but defined by sequential
entries in the pfn_array.
> +
> + vsp_request->version = 0;
> + vsp_request->timeout_ms = request.timeout;
> + vsp_request->operation_type = AZ_BLOB_DRIVER_USER_REQUEST;
> + guid_copy(&vsp_request->transaction_id, &request.guid);
> +
> + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> + list_add_tail(&request_ctx.list, &file_ctx->vsp_pending_requests);
> + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
> +
> + az_blob_dbg("sending request to VSP\n");
> + az_blob_dbg("desc_size %u desc->range.len %u desc->range.offset %u\n",
> + desc_size, desc->range.len, desc->range.offset);
> + az_blob_dbg("vsp_request data_buffer_offset %u data_buffer_length %u "
> + "data_buffer_valid %u request_buffer_offset %u "
> + "request_buffer_length %u response_buffer_offset %u "
> + "response_buffer_length %u\n",
> + vsp_request->data_buffer_offset,
> + vsp_request->data_buffer_length,
> + vsp_request->data_buffer_valid,
> + vsp_request->request_buffer_offset,
> + vsp_request->request_buffer_length,
> + vsp_request->response_buffer_offset,
> + vsp_request->response_buffer_length);
> +
> + ret = vmbus_sendpacket_mpb_desc(dev->device->channel, desc, desc_size,
> + vsp_request, sizeof(*vsp_request), (u64) &request_ctx);
> +
> + kfree(desc);
> + kfree(vsp_request);
> + if (ret)
> + goto vmbus_send_failed;
> +
> + wait_for_completion(&request_ctx.wait_vsp);
> +
> + /*
> + * At this point, the response is already written to request
> + * by VMBUS completion handler, copy them to user-mode buffers
> + * and return to user-mode
> + */
> + if (copy_to_user(argp +
> + offsetof(struct az_blob_request_sync,
> + response.status),
> + &request.response.status,
> + sizeof(request.response.status))) {
> + ret = -EFAULT;
> + goto vmbus_send_failed;
> + }
> +
> + if (copy_to_user(argp +
> + offsetof(struct az_blob_request_sync,
> + response.response_len),
> + &request.response.response_len,
> + sizeof(request.response.response_len)))
> + ret = -EFAULT;
> +
> +vmbus_send_failed:
> + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> + list_del(&request_ctx.list);
> + if (list_empty(&file_ctx->vsp_pending_requests))
> + wake_up(&file_ctx->wait_vsp_pending);
> + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
> +
> +get_user_page_failed:
> + free_buffer_pages(request_num_pages, request_pages);
> + free_buffer_pages(response_num_pages, response_pages);
> + free_buffer_pages(data_num_pages, data_pages);
> +
> + return ret;
> +}
> +
> +static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd,
> + unsigned long arg)
> +{
> + long ret = -EIO;
> +
> + switch (cmd) {
> + case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST:
> + if (_IOC_SIZE(cmd) != sizeof(struct az_blob_request_sync))
> + return -EINVAL;
> + ret = az_blob_ioctl_user_request(filp, arg);
> + break;
> +
> + default:
> + az_blob_dbg("unrecognized IOCTL code %u\n", cmd);
> + }
> +
> + return ret;
> +}
> +
> +static const struct file_operations az_blob_client_fops = {
> + .owner = THIS_MODULE,
> + .open = az_blob_fop_open,
> + .unlocked_ioctl = az_blob_fop_ioctl,
> + .release = az_blob_fop_release,
> +};
> +
> +static struct miscdevice az_blob_misc_device = {
> + MISC_DYNAMIC_MINOR,
> + "azure_blob",
> + &az_blob_client_fops,
> +};
> +
> +static int az_blob_show_pending_requests(struct seq_file *m, void *v)
> +{
> + unsigned long flags, flags2;
> + struct az_blob_vsp_request_ctx *request_ctx;
> + struct az_blob_file_ctx *file_ctx;
> +
> + seq_puts(m, "List of pending requests\n");
> + seq_puts(m, "UUID request_len response_len data_len "
> + "request_buffer response_buffer data_buffer\n");
> + spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> + list_for_each_entry(file_ctx, &az_blob_dev.file_list, list) {
> + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags2);
> + list_for_each_entry(request_ctx,
> + &file_ctx->vsp_pending_requests, list) {
> + seq_printf(m, "%pUb ", &request_ctx->request->guid);
> + seq_printf(m, "%u ", request_ctx->request->request_len);
> + seq_printf(m,
> + "%u ", request_ctx->request->response_len);
> + seq_printf(m, "%u ", request_ctx->request->data_len);
> + seq_printf(m,
> + "%llx ", request_ctx->request->request_buffer);
> + seq_printf(m,
> + "%llx ", request_ctx->request->response_buffer);
> + seq_printf(m,
> + "%llx\n", request_ctx->request->data_buffer);
> + }
> + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags2);
> + }
> + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> +
> + return 0;
> +}
> +
> +static int az_blob_debugfs_open(struct inode *inode, struct file *file)
> +{
> + return single_open(file, az_blob_show_pending_requests, NULL);
> +}
> +
> +static const struct file_operations az_blob_debugfs_fops = {
> + .open = az_blob_debugfs_open,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = seq_release
> +};
> +
> +static void az_blob_remove_device(struct az_blob_device *dev)
> +{
> + wait_event(dev->file_wait, list_empty(&dev->file_list));
> + misc_deregister(&az_blob_misc_device);
> +#ifdef CONFIG_DEBUG_FS
> + debugfs_remove_recursive(az_blob_debugfs_root);
> +#endif
> + /* At this point, we won't get any requests from user-mode */
> +}
> +
> +static int az_blob_create_device(struct az_blob_device *dev)
> +{
> + int rc;
> + struct dentry *d;
> +
> + rc = misc_register(&az_blob_misc_device);
> + if (rc) {
> + az_blob_err("misc_register failed rc %d\n", rc);
> + return rc;
> + }
> +
> +#ifdef CONFIG_DEBUG_FS
> + az_blob_debugfs_root = debugfs_create_dir("az_blob", NULL);
> + if (!IS_ERR_OR_NULL(az_blob_debugfs_root)) {
> + d = debugfs_create_file("pending_requests", 0400,
> + az_blob_debugfs_root, NULL,
> + &az_blob_debugfs_fops);
> + if (IS_ERR_OR_NULL(d)) {
> + az_blob_warn("failed to create debugfs file\n");
> + debugfs_remove_recursive(az_blob_debugfs_root);
> + az_blob_debugfs_root = NULL;
> + }
> + } else
> + az_blob_warn("failed to create debugfs root\n");
> +#endif
> +
> + return 0;
> +}
> +
> +static int az_blob_connect_to_vsp(struct hv_device *device, u32 ring_size)
> +{
> + int ret;
> +
> + spin_lock_init(&az_blob_dev.file_lock);
I'd argue that the spin lock should not be re-initialized here. Here's
the sequence where things go wrong:
1) The driver is unbound, so az_blob_remove() is called.
2) az_blob_remove() sets the "removing" flag to true, and calls
az_blob_remove_device().
3) az_blob_remove_device() waits for the file_list to become empty.
4) After the file_list becomes empty, but before misc_deregister()
is called, a separate thread opens the device again.
5) In the separate thread, az_blob_fop_open() obtains the file_lock
spin lock.
6) Before az_blob_fop_open() releases the spin lock,
az_blob_remove_device() completes, along with az_blob_remove().
7) Then the device gets rebound, and az_blob_connect_to_vsp()
gets called, all while az_blob_fop_open() still holds the spin
lock. So the spin lock get re-initialized while it is held.
This is admittedly a far-fetched scenario, but stranger things
have happened. :-) The issue is that you are counting on
the az_blob_dev structure to persist and have a valid file_lock,
even while the device is unbound. So any initialization
should only happen in az_blob_drv_init().
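i.e. something like this (untested sketch), doing the one-time setup at
module init:

static int __init az_blob_drv_init(void)
{
	spin_lock_init(&az_blob_dev.file_lock);
	INIT_LIST_HEAD(&az_blob_dev.file_list);
	init_waitqueue_head(&az_blob_dev.file_wait);

	return vmbus_driver_register(&az_blob_drv);
}

and leaving only the per-bind state (removing = false, the device
pointer, rqstor_size, vmbus_open()) in az_blob_connect_to_vsp().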
> + INIT_LIST_HEAD(&az_blob_dev.file_list);
> + init_waitqueue_head(&az_blob_dev.file_wait);
> +
> + az_blob_dev.removing = false;
> +
> + az_blob_dev.device = device;
> + device->channel->rqstor_size = device_queue_depth;
> +
> + ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
> + az_blob_on_channel_callback, device->channel);
> +
> + if (ret) {
> + az_blob_err("failed to connect to VSP ret %d\n", ret);
> + return ret;
> + }
> +
> + hv_set_drvdata(device, &az_blob_dev);
> +
> + return ret;
> +}
> +
> +static void az_blob_remove_vmbus(struct hv_device *device)
> +{
> + /* At this point, no VSC/VSP traffic is possible over vmbus */
> + hv_set_drvdata(device, NULL);
> + vmbus_close(device->channel);
> +}
> +
> +static int az_blob_probe(struct hv_device *device,
> + const struct hv_vmbus_device_id *dev_id)
> +{
> + int rc;
> +
> + az_blob_dbg("probing device\n");
> +
> + rc = az_blob_connect_to_vsp(device, az_blob_ringbuffer_size);
> + if (rc) {
> + az_blob_err("error connecting to VSP rc %d\n", rc);
> + return rc;
> + }
> +
> + // create user-mode client library facing device
> + rc = az_blob_create_device(&az_blob_dev);
> + if (rc) {
> + az_blob_remove_vmbus(device);
> + return rc;
> + }
> +
> + az_blob_dbg("successfully probed device\n");
> + return 0;
> +}
> +
> +static int az_blob_remove(struct hv_device *dev)
> +{
> + struct az_blob_device *device = hv_get_drvdata(dev);
> + unsigned long flags;
> +
> + spin_lock_irqsave(&device->file_lock, flags);
> + device->removing = true;
> + spin_unlock_irqrestore(&device->file_lock, flags);
> +
> + az_blob_remove_device(device);
> + az_blob_remove_vmbus(dev);
> + return 0;
> +}
> +
> +static struct hv_driver az_blob_drv = {
> + .name = KBUILD_MODNAME,
> + .id_table = id_table,
> + .probe = az_blob_probe,
> + .remove = az_blob_remove,
> + .driver = {
> + .probe_type = PROBE_PREFER_ASYNCHRONOUS,
> + },
> +};
> +
> +static int __init az_blob_drv_init(void)
> +{
> + int ret;
> +
> + ret = vmbus_driver_register(&az_blob_drv);
> + return ret;
> +}
> +
> +static void __exit az_blob_drv_exit(void)
> +{
> + vmbus_driver_unregister(&az_blob_drv);
> +}
> +
> +MODULE_LICENSE("Dual BSD/GPL");
> +MODULE_DESCRIPTION("Microsoft Azure Blob driver");
> +module_init(az_blob_drv_init);
> +module_exit(az_blob_drv_exit);
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index 0c75662..436fdeb 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -154,6 +154,13 @@
> .allowed_in_isolated = false,
> },
>
> + /* Azure Blob */
> + { .dev_type = HV_AZURE_BLOB,
> + HV_AZURE_BLOB_GUID,
> + .perf_device = false,
> + .allowed_in_isolated = false,
> + },
> +
> /* Unknown GUID */
> { .dev_type = HV_UNKNOWN,
> .perf_device = false,
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> index d1e59db..ac31362 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -772,6 +772,7 @@ enum vmbus_device_type {
> HV_FCOPY,
> HV_BACKUP,
> HV_DM,
> + HV_AZURE_BLOB,
> HV_UNKNOWN,
> };
>
> @@ -1350,6 +1351,14 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
> 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f)
>
> /*
> + * Azure Blob GUID
> + * {0590b792-db64-45cc-81db-b8d70c577c9e}
> + */
> +#define HV_AZURE_BLOB_GUID \
> + .guid = GUID_INIT(0x0590b792, 0xdb64, 0x45cc, 0x81, 0xdb, \
> + 0xb8, 0xd7, 0x0c, 0x57, 0x7c, 0x9e)
> +
> +/*
> * Shutdown GUID
> * {0e0b6031-5213-4934-818b-38d90ced39db}
> */
> diff --git a/include/uapi/misc/azure_blob.h b/include/uapi/misc/azure_blob.h
> new file mode 100644
> index 0000000..29413fc
> --- /dev/null
> +++ b/include/uapi/misc/azure_blob.h
> @@ -0,0 +1,31 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
> +/* Copyright (c) 2021, Microsoft Corporation. */
> +
> +#ifndef _AZ_BLOB_H
> +#define _AZ_BLOB_H
Seems like a #include of asm/ioctl.h (or something similar)
is needed so that _IOWR is defined. Also, a #include
is needed for __u32, __aligned_u64, guid_t, etc.
> +
> +/* user-mode sync request sent through ioctl */
> +struct az_blob_request_sync_response {
> + __u32 status;
> + __u32 response_len;
> +};
> +
> +struct az_blob_request_sync {
> + guid_t guid;
> + __u32 timeout;
> + __u32 request_len;
> + __u32 response_len;
> + __u32 data_len;
> + __u32 data_valid;
> + __aligned_u64 request_buffer;
> + __aligned_u64 response_buffer;
> + __aligned_u64 data_buffer;
> + struct az_blob_request_sync_response response;
> +};
> +
> +#define AZ_BLOB_MAGIC_NUMBER 'R'
> +#define IOCTL_AZ_BLOB_DRIVER_USER_REQUEST \
> + _IOWR(AZ_BLOB_MAGIC_NUMBER, 0xf0, \
> + struct az_blob_request_sync)
> +
> +#endif /* define _AZ_BLOB_H */
> --
> 1.8.3.1
On Fri, Jun 25, 2021 at 11:30:19PM -0700, [email protected] wrote:
> +#ifdef CONFIG_DEBUG_FS
> +struct dentry *az_blob_debugfs_root;
> +#endif
No need to keep this dentry, just look it up if you need it.
> +
> +static struct az_blob_device az_blob_dev;
> +
> +static int az_blob_ringbuffer_size = (128 * 1024);
> +module_param(az_blob_ringbuffer_size, int, 0444);
> +MODULE_PARM_DESC(az_blob_ringbuffer_size, "Ring buffer size (bytes)");
This is NOT the 1990's, please do not create new module parameters.
Just make this work properly for everyone.
> +#define AZ_ERR 0
> +#define AZ_WARN 1
> +#define AZ_DBG 2
> +static int log_level = AZ_ERR;
> +module_param(log_level, int, 0644);
> +MODULE_PARM_DESC(log_level,
> + "Log level: 0 - Error (default), 1 - Warning, 2 - Debug.");
A single driver does not need a special debug/log level, use the
system-wide functions and all will "just work"
> +
> +static uint device_queue_depth = 1024;
> +module_param(device_queue_depth, uint, 0444);
> +MODULE_PARM_DESC(device_queue_depth,
> + "System level max queue depth for this device");
> +
> +#define az_blob_log(level, fmt, args...) \
> +do { \
> + if (level <= log_level) \
> + pr_err("%s:%d " fmt, __func__, __LINE__, ##args); \
> +} while (0)
> +
> +#define az_blob_dbg(fmt, args...) az_blob_log(AZ_DBG, fmt, ##args)
> +#define az_blob_warn(fmt, args...) az_blob_log(AZ_WARN, fmt, ##args)
> +#define az_blob_err(fmt, args...) az_blob_log(AZ_ERR, fmt, ##args)
Again, no.
Just use dev_dbg(), dev_warn(), and dev_err() and there is no need for
anything "special". This is just one tiny driver, do not rewrite logic
like this for no reason.
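For example, az_blob_connect_to_vsp() already has the struct hv_device at
hand, so (untested):

	ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
			 az_blob_on_channel_callback, device->channel);
	if (ret) {
		dev_err(&device->device, "failed to connect to VSP, ret %d\n", ret);
		return ret;
	}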
> +static void az_blob_remove_device(struct az_blob_device *dev)
> +{
> + wait_event(dev->file_wait, list_empty(&dev->file_list));
> + misc_deregister(&az_blob_misc_device);
> +#ifdef CONFIG_DEBUG_FS
No need for the #ifdef.
> + debugfs_remove_recursive(az_blob_debugfs_root);
> +#endif
> + /* At this point, we won't get any requests from user-mode */
> +}
> +
> +static int az_blob_create_device(struct az_blob_device *dev)
> +{
> + int rc;
> + struct dentry *d;
> +
> + rc = misc_register(&az_blob_misc_device);
> + if (rc) {
> + az_blob_err("misc_register failed rc %d\n", rc);
> + return rc;
> + }
> +
> +#ifdef CONFIG_DEBUG_FS
No need for the #ifdef
> + az_blob_debugfs_root = debugfs_create_dir("az_blob", NULL);
> + if (!IS_ERR_OR_NULL(az_blob_debugfs_root)) {
No need to check this.
> + d = debugfs_create_file("pending_requests", 0400,
> + az_blob_debugfs_root, NULL,
> + &az_blob_debugfs_fops);
> + if (IS_ERR_OR_NULL(d)) {
How can that be NULL?
No need to ever check any debugfs calls, please just make them and move
on.
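i.e. something like this (untested), with no #ifdef, no global dentry and no
error checking at all:

	struct dentry *root;

	root = debugfs_create_dir("az_blob", NULL);
	debugfs_create_file("pending_requests", 0400, root, NULL,
			    &az_blob_debugfs_fops);

and on the remove path just:

	debugfs_remove_recursive(debugfs_lookup("az_blob", NULL));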
thanks,
greg k-h
On Fri, Jun 25, 2021 at 11:30:19PM -0700, [email protected] wrote:
> @@ -0,0 +1,655 @@
> +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
If you _REALLY_ want to do this, please document _WHY_ you are doing
this in the changelog comment as dual-licensed Linux kernel code will
just cause you a lot of problems in the long run and we want to be sure
you understand all of the issues here and your managers agree with it.
thanks,
greg k-h
On 26. 06. 21, 8:30, [email protected] wrote:
> Azure Blob storage provides scalable and durable data storage for Azure.
> (https://azure.microsoft.com/en-us/services/storage/blobs/)
>
> This driver adds support for accelerated access to Azure Blob storage. As an
> alternative to REST APIs, it provides a fast data path that uses host native
> network stack and secure direct data link for storage server access.
>
...
> Cc: Jiri Slaby <[email protected]>
I am just curious, why am I CCed? Anyway, some comments below:
> --- /dev/null
> +++ b/drivers/hv/azure_blob.c
> @@ -0,0 +1,655 @@
...
> +static void az_blob_on_channel_callback(void *context)
> +{
> + struct vmbus_channel *channel = (struct vmbus_channel *)context;
Have you fed this patch through checkpatch?
> + const struct vmpacket_descriptor *desc;
> +
> + az_blob_dbg("entering interrupt from vmbus\n");
> + foreach_vmbus_pkt(desc, channel) {
> + struct az_blob_vsp_request_ctx *request_ctx;
> + struct az_blob_vsp_response *response;
> + u64 cmd_rqst = vmbus_request_addr(&channel->requestor,
> + desc->trans_id);
> + if (cmd_rqst == VMBUS_RQST_ERROR) {
> + az_blob_err("incorrect transaction id %llu\n",
> + desc->trans_id);
> + continue;
> + }
> + request_ctx = (struct az_blob_vsp_request_ctx *) cmd_rqst;
> + response = hv_pkt_data(desc);
> +
> + az_blob_dbg("got response for request %pUb status %u "
> + "response_len %u\n",
> + &request_ctx->request->guid, response->error,
> + response->response_len);
> + request_ctx->request->response.status = response->error;
> + request_ctx->request->response.response_len =
> + response->response_len;
> + complete(&request_ctx->wait_vsp);
> + }
> +
> +}
...
> +static long az_blob_ioctl_user_request(struct file *filp, unsigned long arg)
> +{
> + struct az_blob_device *dev = &az_blob_dev;
> + struct az_blob_file_ctx *file_ctx = filp->private_data;
> + char __user *argp = (char __user *) arg;
> + struct az_blob_request_sync request;
> + struct az_blob_vsp_request_ctx request_ctx;
> + unsigned long flags;
> + int ret;
> + size_t request_start, request_num_pages = 0;
> + size_t response_start, response_num_pages = 0;
> + size_t data_start, data_num_pages = 0, total_num_pages;
> + struct page **request_pages = NULL, **response_pages = NULL;
> + struct page **data_pages = NULL;
> + struct vmbus_packet_mpb_array *desc;
> + u64 *pfn_array;
> + int desc_size;
> + int page_idx;
> + struct az_blob_vsp_request *vsp_request;
Ugh, see further.
> +
> + /* Fast fail if device is being removed */
> + if (dev->removing)
> + return -ENODEV;
> +
> + if (!az_blob_safe_file_access(filp)) {
> + az_blob_dbg("process %d(%s) changed security contexts after"
> + " opening file descriptor\n",
> + task_tgid_vnr(current), current->comm);
> + return -EACCES;
> + }
> +
> + if (copy_from_user(&request, argp, sizeof(request))) {
> + az_blob_dbg("don't have permission to user provided buffer\n");
> + return -EFAULT;
> + }
> +
> + az_blob_dbg("az_blob ioctl request guid %pUb timeout %u request_len %u"
> + " response_len %u data_len %u request_buffer %llx "
> + "response_buffer %llx data_buffer %llx\n",
> + &request.guid, request.timeout, request.request_len,
> + request.response_len, request.data_len, request.request_buffer,
> + request.response_buffer, request.data_buffer);
> +
> + if (!request.request_len || !request.response_len)
> + return -EINVAL;
> +
> + if (request.data_len && request.data_len < request.data_valid)
> + return -EINVAL;
> +
> + init_completion(&request_ctx.wait_vsp);
> + request_ctx.request = &request;
> +
> + /*
> + * Need to set rw to READ to have page table set up for passing to VSP.
> + * Setting it to WRITE will cause the page table entry not allocated
> + * as it's waiting on Copy-On-Write on next page fault. This doesn't
> + * work in this scenario because VSP wants all the pages to be present.
> + */
> + ret = get_buffer_pages(READ, (void __user *) request.request_buffer,
Does this need __force for sparse not to complain?
> + request.request_len, &request_pages, &request_start,
> + &request_num_pages);
> + if (ret)
> + goto get_user_page_failed;
> +
> + ret = get_buffer_pages(READ, (void __user *) request.response_buffer,
> + request.response_len, &response_pages, &response_start,
> + &response_num_pages);
> + if (ret)
> + goto get_user_page_failed;
> +
> + if (request.data_len) {
> + ret = get_buffer_pages(READ,
> + (void __user *) request.data_buffer, request.data_len,
> + &data_pages, &data_start, &data_num_pages);
> + if (ret)
> + goto get_user_page_failed;
> + }
> +
> + total_num_pages = request_num_pages + response_num_pages +
> + data_num_pages;
> + if (total_num_pages > AZ_BLOB_MAX_PAGES) {
> + az_blob_dbg("number of DMA pages %lu buffer exceeding %u\n",
> + total_num_pages, AZ_BLOB_MAX_PAGES);
> + ret = -EINVAL;
> + goto get_user_page_failed;
> + }
> +
> + /* Construct a VMBUS packet and send it over to VSP */
> + desc_size = sizeof(struct vmbus_packet_mpb_array) +
> + sizeof(u64) * total_num_pages;
> + desc = kzalloc(desc_size, GFP_KERNEL);
Smells like a call for struct_size().
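I.e. something like (untested):

	desc_size = struct_size(desc, range.pfn_array, total_num_pages);
	desc = kzalloc(desc_size, GFP_KERNEL);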
> + vsp_request = kzalloc(sizeof(*vsp_request), GFP_KERNEL);
> + if (!desc || !vsp_request) {
> + kfree(desc);
> + kfree(vsp_request);
> + ret = -ENOMEM;
> + goto get_user_page_failed;
> + }
> +
> + desc->range.offset = 0;
> + desc->range.len = total_num_pages * PAGE_SIZE;
> + pfn_array = desc->range.pfn_array;
> + page_idx = 0;
> +
> + if (request.data_len) {
> + fill_in_page_buffer(pfn_array, &page_idx, data_pages,
> + data_num_pages);
> + vsp_request->data_buffer_offset = data_start;
> + vsp_request->data_buffer_length = request.data_len;
> + vsp_request->data_buffer_valid = request.data_valid;
> + }
> +
> + fill_in_page_buffer(pfn_array, &page_idx, request_pages,
> + request_num_pages);
> + vsp_request->request_buffer_offset = request_start +
> + data_num_pages * PAGE_SIZE;
> + vsp_request->request_buffer_length = request.request_len;
> +
> + fill_in_page_buffer(pfn_array, &page_idx, response_pages,
> + response_num_pages);
> + vsp_request->response_buffer_offset = response_start +
> + (data_num_pages + request_num_pages) * PAGE_SIZE;
> + vsp_request->response_buffer_length = request.response_len;
> +
> + vsp_request->version = 0;
> + vsp_request->timeout_ms = request.timeout;
> + vsp_request->operation_type = AZ_BLOB_DRIVER_USER_REQUEST;
> + guid_copy(&vsp_request->transaction_id, &request.guid);
> +
> + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> + list_add_tail(&request_ctx.list, &file_ctx->vsp_pending_requests);
> + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
> +
> + az_blob_dbg("sending request to VSP\n");
> + az_blob_dbg("desc_size %u desc->range.len %u desc->range.offset %u\n",
> + desc_size, desc->range.len, desc->range.offset);
> + az_blob_dbg("vsp_request data_buffer_offset %u data_buffer_length %u "
> + "data_buffer_valid %u request_buffer_offset %u "
> + "request_buffer_length %u response_buffer_offset %u "
> + "response_buffer_length %u\n",
> + vsp_request->data_buffer_offset,
> + vsp_request->data_buffer_length,
> + vsp_request->data_buffer_valid,
> + vsp_request->request_buffer_offset,
> + vsp_request->request_buffer_length,
> + vsp_request->response_buffer_offset,
> + vsp_request->response_buffer_length);
> +
> + ret = vmbus_sendpacket_mpb_desc(dev->device->channel, desc, desc_size,
> + vsp_request, sizeof(*vsp_request), (u64) &request_ctx);
> +
> + kfree(desc);
> + kfree(vsp_request);
> + if (ret)
> + goto vmbus_send_failed;
> +
> + wait_for_completion(&request_ctx.wait_vsp);
Provided this is ioctl, this should likely be interruptible. You don't
want to account to I/O load. The same likely for az_blob_fop_release.
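Something like the following, assuming an interrupted request can be safely
abandoned at this point (i.e. the VSP won't complete into the on-stack
request_ctx afterwards):

	ret = wait_for_completion_interruptible(&request_ctx.wait_vsp);
	if (ret)
		goto vmbus_send_failed;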
> +
> + /*
> + * At this point, the response is already written to request
> + * by VMBUS completion handler, copy them to user-mode buffers
> + * and return to user-mode
> + */
> + if (copy_to_user(argp +
> + offsetof(struct az_blob_request_sync,
> + response.status),
> + &request.response.status,
This is ugly, why don't you make argp the appropriate pointer instead of
char *? You'd need not do copy_to_user twice then, right?
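I.e. (untested):

	struct az_blob_request_sync __user *argp =
		(struct az_blob_request_sync __user *)arg;
	...
	if (copy_to_user(&argp->response, &request.response,
			 sizeof(request.response)))
		ret = -EFAULT;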
> + sizeof(request.response.status))) {
> + ret = -EFAULT;
> + goto vmbus_send_failed;
> + }
> +
> + if (copy_to_user(argp +
> + offsetof(struct az_blob_request_sync,
> + response.response_len),
The same here.
> + &request.response.response_len,
> + sizeof(request.response.response_len)))
> + ret = -EFAULT;
> +
> +vmbus_send_failed:
> + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> + list_del(&request_ctx.list);
> + if (list_empty(&file_ctx->vsp_pending_requests))
> + wake_up(&file_ctx->wait_vsp_pending);
> + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
> +
> +get_user_page_failed:
> + free_buffer_pages(request_num_pages, request_pages);
> + free_buffer_pages(response_num_pages, response_pages);
> + free_buffer_pages(data_num_pages, data_pages);
> +
> + return ret;
> +}
This function is overly long. Care to split it (e.g. moving away the
initialization of the structs and the debug stuff)?
> +
> +static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd,
> + unsigned long arg)
> +{
> + long ret = -EIO;
EINVAL would be more appropriate.
> +
> + switch (cmd) {
> + case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST:
> + if (_IOC_SIZE(cmd) != sizeof(struct az_blob_request_sync))
> + return -EINVAL;
How can that happen, provided the switch (cmd) and case?
> + ret = az_blob_ioctl_user_request(filp, arg);
> + break;
Simply:
return az_blob_ioctl_user_request(filp, arg);
> +
> + default:
> + az_blob_dbg("unrecognized IOCTL code %u\n", cmd);
> + }
> +
> + return ret;
So return -EINVAL here directly now.
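So the whole function could be something like:

static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd,
		unsigned long arg)
{
	switch (cmd) {
	case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST:
		return az_blob_ioctl_user_request(filp, arg);
	default:
		az_blob_dbg("unrecognized IOCTL code %u\n", cmd);
		return -EINVAL;
	}
}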
> +}
...
> +static int az_blob_connect_to_vsp(struct hv_device *device, u32 ring_size)
> +{
> + int ret;
...
> + int rc;
Sometimes ret, sometimes rc, could you unify the two?
> +static int __init az_blob_drv_init(void)
> +{
> + int ret;
> +
> + ret = vmbus_driver_register(&az_blob_drv);
> + return ret;
Simply:
return vmbus_driver_register(&az_blob_drv);
regards,
--
--
js
suse labs
On 29.06.21 08:24, Greg Kroah-Hartman wrote:
Hi folks,
> Again, no.
>
> Just use dev_dbg(), dev_warn(), and dev_err() and there is no need for
> anything "special". This is just one tiny driver, do not rewrite logic
> like this for no reason.
Maybe worth mentioning that we have the pr_fmt macro that can be
defined if someone wants to do things like adding some prefix.
> No need to ever check any debugfs calls, please just make them and move
> on.
In this case a failed attempt to create debugfs files/dirs should not be
treated as some actual error, but a normal situation, therefore there
shouldn't be an error message. (see debugfs.h)
--mtx
--
---
Note: unencrypted e-mails can easily be intercepted and manipulated!
For confidential communication, please send your GPG/PGP key.
---
Enrico Weigelt, metux IT consult
Free software and Linux embedded engineering
[email protected] -- +49-151-27565287
On Tue, Jun 29, 2021 at 12:41:49PM +0200, Enrico Weigelt, metux IT consult wrote:
> On 29.06.21 08:24, Greg Kroah-Hartman wrote:
>
> Hi folks,
>
> > Again, no.
> >
> > Just use dev_dbg(), dev_warn(), and dev_err() and there is no need for
> > anything "special". This is just one tiny driver, do not rewrite logic
> > like this for no reason.
>
> Maybe worth mentioning that we have the pr_fmt macro that can be
> defined if someone wants to do things like adding some prefix.
No, no driver should mess with that, just use dev_*() calls instead
please.
greg k-h
On 29.06.21 14:18, Greg Kroah-Hartman wrote:
> On Tue, Jun 29, 2021 at 12:41:49PM +0200, Enrico Weigelt, metux IT consult wrote:
>> On 29.06.21 08:24, Greg Kroah-Hartman wrote:
>>
>> Hi folks,
>>
>>> Again, no.
>>>
>>> Just use dev_dbg(), dev_warn(), and dev_err() and there is no need for
>>> anything "special". This is just one tiny driver, do not rewrite logic
>>> like this for no reason.
>>
>> Maybe worth mentioning that we have the pr_fmt macro that can be
>> defined if someone wants to do things like adding some prefix.
>
> No, no driver should mess with that, just use dev_*() calls instead
> please.
Well, I've been referring to some scenario where you *really* want some
*additional* prefix over the whole file. But you're right, the good use
cases for that are very rare - dev_*() usually should already put enough
information into the message.
--mtx
--
---
Note: unencrypted e-mails can easily be intercepted and manipulated!
For confidential communication, please send your GPG/PGP key.
---
Enrico Weigelt, metux IT consult
Free software and Linux embedded engineering
[email protected] -- +49-151-27565287
> Subject: RE: [Patch v2 2/3] Drivers: hv: add Azure Blob driver
>
> From: [email protected] <[email protected]> Sent: Friday,
> June 25, 2021 11:30 PM
> >
> > Azure Blob storage provides scalable and durable data storage for Azure.
> > (https://azure.microsoft.com/en-us/services/storage/blobs/)
> >
> > This driver adds support for accelerated access to Azure Blob storage.
> > As an alternative to REST APIs, it provides a fast data path that uses
> > host native network stack and secure direct data link for storage server
> access.
> >
> > Cc: Jonathan Corbet <[email protected]>
> > Cc: "K. Y. Srinivasan" <[email protected]>
> > Cc: Haiyang Zhang <[email protected]>
> > Cc: Stephen Hemminger <[email protected]>
> > Cc: Wei Liu <[email protected]>
> > Cc: Dexuan Cui <[email protected]>
> > Cc: Greg Kroah-Hartman <[email protected]>
> > Cc: Bjorn Andersson <[email protected]>
> > Cc: Hans de Goede <[email protected]>
> > Cc: Dan Williams <[email protected]>
> > Cc: Maximilian Luz <[email protected]>
> > Cc: Mike Rapoport <[email protected]>
> > Cc: Ben Widawsky <[email protected]>
> > Cc: Jiri Slaby <[email protected]>
> > Cc: Andra Paraschiv <[email protected]>
> > Cc: Siddharth Gupta <[email protected]>
> > Cc: Hannes Reinecke <[email protected]>
> > Cc: [email protected]
> > Signed-off-by: Long Li <[email protected]>
> > ---
> > Documentation/userspace-api/ioctl/ioctl-number.rst | 2 +
> > drivers/hv/Kconfig | 10 +
> > drivers/hv/Makefile | 1 +
> > drivers/hv/azure_blob.c | 655 +++++++++++++++++++++
> > drivers/hv/channel_mgmt.c | 7 +
> > include/linux/hyperv.h | 9 +
> > include/uapi/misc/azure_blob.h | 31 +
> > 7 files changed, 715 insertions(+)
> > create mode 100644 drivers/hv/azure_blob.c create mode 100644
> > include/uapi/misc/azure_blob.h
> >
> > diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst
> > b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > index 9bfc2b5..d3c2a90 100644
> > --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> > +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > @@ -180,6 +180,8 @@ Code Seq# Include File
> Comments
> > 'R' 01 linux/rfkill.h conflict!
> > 'R' C0-DF net/bluetooth/rfcomm.h
> > 'R' E0 uapi/linux/fsl_mc.h
> > +'R' F0-FF uapi/misc/azure_blob.h Microsoft Azure Blob
> driver
> > +
> > +<mailto:[email protected]>
> > 'S' all linux/cdrom.h conflict!
> > 'S' 80-81 scsi/scsi_ioctl.h conflict!
> > 'S' 82-FF scsi/scsi.h conflict!
> > diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index
> > 66c794d..e08b8d3 100644
> > --- a/drivers/hv/Kconfig
> > +++ b/drivers/hv/Kconfig
> > @@ -27,4 +27,14 @@ config HYPERV_BALLOON
> > help
> > Select this option to enable Hyper-V Balloon driver.
> >
> > +config HYPERV_AZURE_BLOB
> > + tristate "Microsoft Azure Blob driver"
> > + depends on HYPERV && X86_64
> > + help
> > + Select this option to enable Microsoft Azure Blob driver.
> > +
> > + This driver supports accelerated Microsoft Azure Blob access.
> > + To compile this driver as a module, choose M here. The module will
> be
> > + called azure_blob.
> > +
> > endmenu
> > diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index
> > 94daf82..a322575 100644
> > --- a/drivers/hv/Makefile
> > +++ b/drivers/hv/Makefile
> > @@ -2,6 +2,7 @@
> > obj-$(CONFIG_HYPERV) += hv_vmbus.o
> > obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
> > obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
> > +obj-$(CONFIG_HYPERV_AZURE_BLOB) += azure_blob.o
> >
> > CFLAGS_hv_trace.o = -I$(src)
> > CFLAGS_hv_balloon.o = -I$(src)
> > diff --git a/drivers/hv/azure_blob.c b/drivers/hv/azure_blob.c new
> > file mode 100644 index 0000000..6c8f957
> > --- /dev/null
> > +++ b/drivers/hv/azure_blob.c
> > @@ -0,0 +1,655 @@
> > +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
> > +/* Copyright (c) 2021, Microsoft Corporation. */
> > +
> > +#include <linux/kernel.h>
> > +#include <linux/module.h>
> > +#include <linux/device.h>
> > +#include <linux/slab.h>
> > +#include <linux/cred.h>
> > +#include <linux/debugfs.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/hyperv.h>
> > +#include <linux/miscdevice.h>
> > +#include <linux/uio.h>
> > +#include <uapi/misc/azure_blob.h>
> > +
> > +struct az_blob_device {
> > + struct hv_device *device;
> > +
> > + /* Opened files maintained by this device */
> > + struct list_head file_list;
> > + spinlock_t file_lock;
> > + wait_queue_head_t file_wait;
> > +
> > + bool removing;
> > +};
> > +
> > +/* VSP messages */
> > +enum az_blob_vsp_request_type {
> > + AZ_BLOB_DRIVER_REQUEST_FIRST = 0x100,
> > + AZ_BLOB_DRIVER_USER_REQUEST = 0x100,
> > + AZ_BLOB_DRIVER_REGISTER_BUFFER = 0x101,
> > + AZ_BLOB_DRIVER_DEREGISTER_BUFFER = 0x102, };
> > +
> > +/* VSC->VSP request */
> > +struct az_blob_vsp_request {
> > + u32 version;
> > + u32 timeout_ms;
> > + u32 data_buffer_offset;
> > + u32 data_buffer_length;
> > + u32 data_buffer_valid;
> > + u32 operation_type;
> > + u32 request_buffer_offset;
> > + u32 request_buffer_length;
> > + u32 response_buffer_offset;
> > + u32 response_buffer_length;
> > + guid_t transaction_id;
> > +} __packed;
> > +
> > +/* VSP->VSC response */
> > +struct az_blob_vsp_response {
> > + u32 length;
> > + u32 error;
> > + u32 response_len;
> > +} __packed;
> > +
> > +struct az_blob_vsp_request_ctx {
> > + struct list_head list;
> > + struct completion wait_vsp;
> > + struct az_blob_request_sync *request; };
> > +
> > +struct az_blob_file_ctx {
> > + struct list_head list;
> > +
> > + /* List of pending requests to VSP */
> > + struct list_head vsp_pending_requests;
> > + spinlock_t vsp_pending_lock;
> > + wait_queue_head_t wait_vsp_pending;
> > +};
> > +
> > +/* The maximum number of pages we can pass to VSP in a single packet
> > +*/ #define AZ_BLOB_MAX_PAGES 8192
> > +
> > +#ifdef CONFIG_DEBUG_FS
> > +struct dentry *az_blob_debugfs_root;
> > +#endif
> > +
> > +static struct az_blob_device az_blob_dev;
> > +
> > +static int az_blob_ringbuffer_size = (128 * 1024);
> > +module_param(az_blob_ringbuffer_size, int, 0444);
> > +MODULE_PARM_DESC(az_blob_ringbuffer_size, "Ring buffer size
> > +(bytes)");
> > +
> > +static const struct hv_vmbus_device_id id_table[] = {
> > + { HV_AZURE_BLOB_GUID,
> > + .driver_data = 0
> > + },
> > + { },
> > +};
> > +
> > +#define AZ_ERR 0
> > +#define AZ_WARN 1
> > +#define AZ_DBG 2
> > +static int log_level = AZ_ERR;
> > +module_param(log_level, int, 0644);
> > +MODULE_PARM_DESC(log_level,
> > + "Log level: 0 - Error (default), 1 - Warning, 2 - Debug.");
> > +
> > +static uint device_queue_depth = 1024;
> > +module_param(device_queue_depth, uint, 0444);
> > +MODULE_PARM_DESC(device_queue_depth,
> > + "System level max queue depth for this device");
> > +
> > +#define az_blob_log(level, fmt, args...) \
> > +do { \
> > + if (level <= log_level) \
> > + pr_err("%s:%d " fmt, __func__, __LINE__, ##args); \
> > +} while (0)
> > +
> > +#define az_blob_dbg(fmt, args...) az_blob_log(AZ_DBG, fmt, ##args)
> > +#define az_blob_warn(fmt, args...) az_blob_log(AZ_WARN, fmt, ##args)
> > +#define az_blob_err(fmt, args...) az_blob_log(AZ_ERR, fmt, ##args)
> > +
> > +static void az_blob_on_channel_callback(void *context) {
> > + struct vmbus_channel *channel = (struct vmbus_channel *)context;
> > + const struct vmpacket_descriptor *desc;
> > +
> > + az_blob_dbg("entering interrupt from vmbus\n");
> > + foreach_vmbus_pkt(desc, channel) {
> > + struct az_blob_vsp_request_ctx *request_ctx;
> > + struct az_blob_vsp_response *response;
> > + u64 cmd_rqst = vmbus_request_addr(&channel->requestor,
> > + desc->trans_id);
> > + if (cmd_rqst == VMBUS_RQST_ERROR) {
> > + az_blob_err("incorrect transaction id %llu\n",
> > + desc->trans_id);
> > + continue;
> > + }
> > + request_ctx = (struct az_blob_vsp_request_ctx *) cmd_rqst;
> > + response = hv_pkt_data(desc);
> > +
> > + az_blob_dbg("got response for request %pUb status %u "
> > + "response_len %u\n",
> > + &request_ctx->request->guid, response->error,
> > + response->response_len);
> > + request_ctx->request->response.status = response->error;
> > + request_ctx->request->response.response_len =
> > + response->response_len;
> > + complete(&request_ctx->wait_vsp);
> > + }
> > +
> > +}
> > +
> > +static int az_blob_fop_open(struct inode *inode, struct file *file) {
> > + struct az_blob_file_ctx *file_ctx;
> > + unsigned long flags;
> > +
> > +
> > + file_ctx = kzalloc(sizeof(*file_ctx), GFP_KERNEL);
> > + if (!file_ctx)
> > + return -ENOMEM;
> > +
> > +
> > + spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> > + if (az_blob_dev.removing) {
> > + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> > + kfree(file_ctx);
> > + return -ENODEV;
> > + }
> > + list_add_tail(&file_ctx->list, &az_blob_dev.file_list);
> > + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> > +
> > + INIT_LIST_HEAD(&file_ctx->vsp_pending_requests);
> > + init_waitqueue_head(&file_ctx->wait_vsp_pending);
>
> It feels a little weird to be initializing fields in the file_ctx
> *after* it has been added to the file list. But I guess it's OK since no requests
> can be issued and the file can't be closed until after this open call completes.
>
> > + file->private_data = file_ctx;
> > +
> > + return 0;
> > +}
> > +
> > +static int az_blob_fop_release(struct inode *inode, struct file
> > +*file) {
> > + struct az_blob_file_ctx *file_ctx = file->private_data;
> > + unsigned long flags;
> > +
> > + wait_event(file_ctx->wait_vsp_pending,
> > + list_empty(&file_ctx->vsp_pending_requests));
> > +
> > + spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> > + list_del(&file_ctx->list);
> > + if (list_empty(&az_blob_dev.file_list))
> > + wake_up(&az_blob_dev.file_wait);
> > + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> > +
> > + return 0;
> > +}
> > +
> > +static inline bool az_blob_safe_file_access(struct file *file) {
> > + return file->f_cred == current_cred() && !uaccess_kernel(); }
> > +
> > +static int get_buffer_pages(int rw, void __user *buffer, u32 buffer_len,
> > + struct page ***ppages, size_t *start, size_t *num_pages) {
> > + struct iovec iov;
> > + struct iov_iter iter;
> > + int ret;
> > + ssize_t result;
> > + struct page **pages;
> > +
> > + ret = import_single_range(rw, buffer, buffer_len, &iov, &iter);
> > + if (ret) {
> > + az_blob_dbg("request buffer access error %d\n", ret);
> > + return ret;
> > + }
> > + az_blob_dbg("iov_iter type %d offset %lu count %lu nr_segs %lu\n",
> > + iter.type, iter.iov_offset, iter.count, iter.nr_segs);
> > +
> > + result = iov_iter_get_pages_alloc(&iter, &pages, buffer_len, start);
> > + if (result < 0) {
> > + az_blob_dbg("failed to pin user pages result=%ld\n", result);
> > + return result;
> > + }
> > + if (result != buffer_len) {
> > + az_blob_dbg("can't pin user pages requested %d got %ld\n",
> > + buffer_len, result);
> > + return -EFAULT;
> > + }
> > +
> > + *ppages = pages;
> > + *num_pages = (result + *start + PAGE_SIZE - 1) / PAGE_SIZE;
> > + return 0;
> > +}
> > +
> > +static void fill_in_page_buffer(u64 *pfn_array,
> > + int *index, struct page **pages, unsigned long num_pages) {
> > + int i, page_idx = *index;
> > +
> > + for (i = 0; i < num_pages; i++)
> > + pfn_array[page_idx++] = page_to_pfn(pages[i]);
> > + *index = page_idx;
> > +}
> > +
> > +static void free_buffer_pages(size_t num_pages, struct page **pages)
> > +{
> > + unsigned long i;
> > +
> > + for (i = 0; i < num_pages; i++)
> > + if (pages[i])
> > + put_page(pages[i]);
> > + kvfree(pages);
> > +}
> > +
> > +static long az_blob_ioctl_user_request(struct file *filp, unsigned
> > +long arg) {
> > + struct az_blob_device *dev = &az_blob_dev;
> > + struct az_blob_file_ctx *file_ctx = filp->private_data;
> > + char __user *argp = (char __user *) arg;
> > + struct az_blob_request_sync request;
> > + struct az_blob_vsp_request_ctx request_ctx;
> > + unsigned long flags;
> > + int ret;
> > + size_t request_start, request_num_pages = 0;
> > + size_t response_start, response_num_pages = 0;
> > + size_t data_start, data_num_pages = 0, total_num_pages;
> > + struct page **request_pages = NULL, **response_pages = NULL;
> > + struct page **data_pages = NULL;
> > + struct vmbus_packet_mpb_array *desc;
> > + u64 *pfn_array;
> > + int desc_size;
> > + int page_idx;
> > + struct az_blob_vsp_request *vsp_request;
> > +
> > + /* Fast fail if device is being removed */
> > + if (dev->removing)
> > + return -ENODEV;
> > +
> > + if (!az_blob_safe_file_access(filp)) {
> > + az_blob_dbg("process %d(%s) changed security contexts
> after"
> > + " opening file descriptor\n",
> > + task_tgid_vnr(current), current->comm);
> > + return -EACCES;
> > + }
> > +
> > + if (copy_from_user(&request, argp, sizeof(request))) {
> > + az_blob_dbg("don't have permission to user provided
> buffer\n");
> > + return -EFAULT;
> > + }
> > +
> > + az_blob_dbg("az_blob ioctl request guid %pUb timeout %u
> request_len %u"
> > + " response_len %u data_len %u request_buffer %llx "
> > + "response_buffer %llx data_buffer %llx\n",
> > + &request.guid, request.timeout, request.request_len,
> > + request.response_len, request.data_len,
> request.request_buffer,
> > + request.response_buffer, request.data_buffer);
> > +
> > + if (!request.request_len || !request.response_len)
> > + return -EINVAL;
> > +
> > + if (request.data_len && request.data_len < request.data_valid)
> > + return -EINVAL;
> > +
> > + init_completion(&request_ctx.wait_vsp);
> > + request_ctx.request = &request;
> > +
> > + /*
> > + * Need to set rw to READ to have page table set up for passing to VSP.
> > + * Setting it to WRITE will cause the page table entry not allocated
> > + * as it's waiting on Copy-On-Write on next page fault. This doesn't
> > + * work in this scenario because VSP wants all the pages to be present.
> > + */
> > + ret = get_buffer_pages(READ, (void __user *) request.request_buffer,
> > + request.request_len, &request_pages, &request_start,
> > + &request_num_pages);
> > + if (ret)
> > + goto get_user_page_failed;
> > +
> > + ret = get_buffer_pages(READ, (void __user *)
> request.response_buffer,
> > + request.response_len, &response_pages, &response_start,
> > + &response_num_pages);
> > + if (ret)
> > + goto get_user_page_failed;
> > +
> > + if (request.data_len) {
> > + ret = get_buffer_pages(READ,
> > + (void __user *) request.data_buffer, request.data_len,
> > + &data_pages, &data_start, &data_num_pages);
> > + if (ret)
> > + goto get_user_page_failed;
> > + }
>
> It still seems to me that request.request_len, request.response_len, and
> request.data_len should be validated *before* get_buffer_pages() is called.
> While the total number of pages that are pinned is validated below against
> AZ_BLOB_MAX_PAGES, bad values in the *_len fields could pin up to
> 12 Gbytes of memory before the check against AZ_BLOB_MAX_PAGES
> happens. The pre-validation would make sure that each *_len value
> is less than AZ_BLOB_MAX_PAGES, so that the max amount of memory that
> could get pinned is 96 Mbytes.
>
> Also, are the *_len values required to be a multiple of PAGE_SIZE?
> I'm assuming not, and that any number of bytes is acceptable. If there is some
> requirement such as being a multiple of 512 or something like that, it should
> also be part of the pre-validation.
The *_len values are not required to be aligned to page size or 512 bytes.
Adding pre-validation for those *_len values is a good idea. I'll make the changes.
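Something along these lines, right after the existing length checks (sketch;
it caps each buffer at AZ_BLOB_MAX_PAGES worth of data before any pages are
pinned):

	if (request.request_len > AZ_BLOB_MAX_PAGES * PAGE_SIZE ||
	    request.response_len > AZ_BLOB_MAX_PAGES * PAGE_SIZE ||
	    request.data_len > AZ_BLOB_MAX_PAGES * PAGE_SIZE)
		return -EINVAL;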
>
> > +
> > + total_num_pages = request_num_pages + response_num_pages +
> > + data_num_pages;
> > + if (total_num_pages > AZ_BLOB_MAX_PAGES) {
> > + az_blob_dbg("number of DMA pages %lu buffer
> exceeding %u\n",
> > + total_num_pages, AZ_BLOB_MAX_PAGES);
> > + ret = -EINVAL;
> > + goto get_user_page_failed;
> > + }
> > +
> > + /* Construct a VMBUS packet and send it over to VSP */
> > + desc_size = sizeof(struct vmbus_packet_mpb_array) +
> > + sizeof(u64) * total_num_pages;
> > + desc = kzalloc(desc_size, GFP_KERNEL);
> > + vsp_request = kzalloc(sizeof(*vsp_request), GFP_KERNEL);
> > + if (!desc || !vsp_request) {
> > + kfree(desc);
> > + kfree(vsp_request);
> > + ret = -ENOMEM;
> > + goto get_user_page_failed;
> > + }
> > +
> > + desc->range.offset = 0;
> > + desc->range.len = total_num_pages * PAGE_SIZE;
>
> The range.len value is always a multiple of PAGE_SIZE. Assuming that the
> request.*_len values are not required to be multiples of PAGE_SIZE, then the
> sum of the buffer_length fields in the vsp_request may not add up to
> range.len. Presumably the VSP is OK with that ....
range.len describes the length of the whole array of page buffers sent to the host; the individual buffers within those page buffers are described by the *_offset and *_length fields in the VSP packet.
>
> > + pfn_array = desc->range.pfn_array;
> > + page_idx = 0;
> > +
> > + if (request.data_len) {
> > + fill_in_page_buffer(pfn_array, &page_idx, data_pages,
> > + data_num_pages);
> > + vsp_request->data_buffer_offset = data_start;
> > + vsp_request->data_buffer_length = request.data_len;
> > + vsp_request->data_buffer_valid = request.data_valid;
> > + }
> > +
> > + fill_in_page_buffer(pfn_array, &page_idx, request_pages,
> > + request_num_pages);
> > + vsp_request->request_buffer_offset = request_start +
> > + data_num_pages * PAGE_SIZE;
> > + vsp_request->request_buffer_length = request.request_len;
> > +
> > + fill_in_page_buffer(pfn_array, &page_idx, response_pages,
> > + response_num_pages);
> > + vsp_request->response_buffer_offset = response_start +
> > + (data_num_pages + request_num_pages) * PAGE_SIZE;
> > + vsp_request->response_buffer_length = request.response_len;
>
> Interestingly, if any of the data or request or response are adjacent in user
> memory, but the boundary is not on a page boundary, the same page may
> appear in two successive pfn_array entries. Again, I'm assuming that's OK with
> the VSP. It appears that the VSP just treats the three memory areas as
> completely independent of each other but defined by sequential entries in
> the pfn_array.
Yes, the VSP is okay with that. We can send duplicate page numbers to the VSP if the buffers don't end on a page boundary and are adjacent to each other.
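For example, if the request buffer ends partway into a page and the response buffer starts right after it in that same page, that page's PFN simply appears twice in pfn_array[] (once for the tail of the request buffer, once for the head of the response buffer), and the VSP uses the *_offset/*_length fields to tell the two buffers apart.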
>
> > +
> > + vsp_request->version = 0;
> > + vsp_request->timeout_ms = request.timeout;
> > + vsp_request->operation_type = AZ_BLOB_DRIVER_USER_REQUEST;
> > + guid_copy(&vsp_request->transaction_id, &request.guid);
> > +
> > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> > + list_add_tail(&request_ctx.list, &file_ctx->vsp_pending_requests);
> > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
> > +
> > + az_blob_dbg("sending request to VSP\n");
> > + az_blob_dbg("desc_size %u desc->range.len %u desc-
> >range.offset %u\n",
> > + desc_size, desc->range.len, desc->range.offset);
> > + az_blob_dbg("vsp_request data_buffer_offset %u
> data_buffer_length %u "
> > + "data_buffer_valid %u request_buffer_offset %u "
> > + "request_buffer_length %u response_buffer_offset %u "
> > + "response_buffer_length %u\n",
> > + vsp_request->data_buffer_offset,
> > + vsp_request->data_buffer_length,
> > + vsp_request->data_buffer_valid,
> > + vsp_request->request_buffer_offset,
> > + vsp_request->request_buffer_length,
> > + vsp_request->response_buffer_offset,
> > + vsp_request->response_buffer_length);
> > +
> > + ret = vmbus_sendpacket_mpb_desc(dev->device->channel, desc,
> desc_size,
> > + vsp_request, sizeof(*vsp_request), (u64) &request_ctx);
> > +
> > + kfree(desc);
> > + kfree(vsp_request);
> > + if (ret)
> > + goto vmbus_send_failed;
> > +
> > + wait_for_completion(&request_ctx.wait_vsp);
> > +
> > + /*
> > + * At this point, the response is already written to request
> > + * by VMBUS completion handler, copy them to user-mode buffers
> > + * and return to user-mode
> > + */
> > + if (copy_to_user(argp +
> > + offsetof(struct az_blob_request_sync,
> > + response.status),
> > + &request.response.status,
> > + sizeof(request.response.status))) {
> > + ret = -EFAULT;
> > + goto vmbus_send_failed;
> > + }
> > +
> > + if (copy_to_user(argp +
> > + offsetof(struct az_blob_request_sync,
> > + response.response_len),
> > + &request.response.response_len,
> > + sizeof(request.response.response_len)))
> > + ret = -EFAULT;
> > +
> > +vmbus_send_failed:
> > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> > + list_del(&request_ctx.list);
> > + if (list_empty(&file_ctx->vsp_pending_requests))
> > + wake_up(&file_ctx->wait_vsp_pending);
> > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
> > +
> > +get_user_page_failed:
> > + free_buffer_pages(request_num_pages, request_pages);
> > + free_buffer_pages(response_num_pages, response_pages);
> > + free_buffer_pages(data_num_pages, data_pages);
> > +
> > + return ret;
> > +}
> > +
> > +static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd,
> > + unsigned long arg)
> > +{
> > + long ret = -EIO;
> > +
> > + switch (cmd) {
> > + case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST:
> > + if (_IOC_SIZE(cmd) != sizeof(struct az_blob_request_sync))
> > + return -EINVAL;
> > + ret = az_blob_ioctl_user_request(filp, arg);
> > + break;
> > +
> > + default:
> > + az_blob_dbg("unrecognized IOCTL code %u\n", cmd);
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +static const struct file_operations az_blob_client_fops = {
> > + .owner = THIS_MODULE,
> > + .open = az_blob_fop_open,
> > + .unlocked_ioctl = az_blob_fop_ioctl,
> > + .release = az_blob_fop_release,
> > +};
> > +
> > +static struct miscdevice az_blob_misc_device = {
> > + MISC_DYNAMIC_MINOR,
> > + "azure_blob",
> > + &az_blob_client_fops,
> > +};
> > +
> > +static int az_blob_show_pending_requests(struct seq_file *m, void *v)
> > +{
> > + unsigned long flags, flags2;
> > + struct az_blob_vsp_request_ctx *request_ctx;
> > + struct az_blob_file_ctx *file_ctx;
> > +
> > + seq_puts(m, "List of pending requests\n");
> > + seq_puts(m, "UUID request_len response_len data_len "
> > + "request_buffer response_buffer data_buffer\n");
> > + spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> > + list_for_each_entry(file_ctx, &az_blob_dev.file_list, list) {
> > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags2);
> > + list_for_each_entry(request_ctx,
> > + &file_ctx->vsp_pending_requests, list) {
> > + seq_printf(m, "%pUb ", &request_ctx->request->guid);
> > + seq_printf(m, "%u ", request_ctx->request-
> >request_len);
> > + seq_printf(m,
> > + "%u ", request_ctx->request->response_len);
> > + seq_printf(m, "%u ", request_ctx->request->data_len);
> > + seq_printf(m,
> > + "%llx ", request_ctx->request-
> >request_buffer);
> > + seq_printf(m,
> > + "%llx ", request_ctx->request-
> >response_buffer);
> > + seq_printf(m,
> > + "%llx\n", request_ctx->request->data_buffer);
> > + }
> > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags2);
> > + }
> > + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> > +
> > + return 0;
> > +}
> > +
> > +static int az_blob_debugfs_open(struct inode *inode, struct file
> > +*file) {
> > + return single_open(file, az_blob_show_pending_requests, NULL); }
> > +
> > +static const struct file_operations az_blob_debugfs_fops = {
> > + .open = az_blob_debugfs_open,
> > + .read = seq_read,
> > + .llseek = seq_lseek,
> > + .release = seq_release
> > +};
> > +
> > +static void az_blob_remove_device(struct az_blob_device *dev) {
> > + wait_event(dev->file_wait, list_empty(&dev->file_list));
> > + misc_deregister(&az_blob_misc_device);
> > +#ifdef CONFIG_DEBUG_FS
> > + debugfs_remove_recursive(az_blob_debugfs_root);
> > +#endif
> > + /* At this point, we won't get any requests from user-mode */ }
> > +
> > +static int az_blob_create_device(struct az_blob_device *dev) {
> > + int rc;
> > + struct dentry *d;
> > +
> > + rc = misc_register(&az_blob_misc_device);
> > + if (rc) {
> > + az_blob_err("misc_register failed rc %d\n", rc);
> > + return rc;
> > + }
> > +
> > +#ifdef CONFIG_DEBUG_FS
> > + az_blob_debugfs_root = debugfs_create_dir("az_blob", NULL);
> > + if (!IS_ERR_OR_NULL(az_blob_debugfs_root)) {
> > + d = debugfs_create_file("pending_requests", 0400,
> > + az_blob_debugfs_root, NULL,
> > + &az_blob_debugfs_fops);
> > + if (IS_ERR_OR_NULL(d)) {
> > + az_blob_warn("failed to create debugfs file\n");
> > + debugfs_remove_recursive(az_blob_debugfs_root);
> > + az_blob_debugfs_root = NULL;
> > + }
> > + } else
> > + az_blob_warn("failed to create debugfs root\n"); #endif
> > +
> > + return 0;
> > +}
> > +
> > +static int az_blob_connect_to_vsp(struct hv_device *device, u32
> > +ring_size) {
> > + int ret;
> > +
> > + spin_lock_init(&az_blob_dev.file_lock);
>
> I'd argue that the spin lock should not be re-initialized here. Here's the
> sequence where things go wrong:
>
> 1) The driver is unbound, so az_blob_remove() is called.
> 2) az_blob_remove() sets the "removing" flag to true, and calls
> az_blob_remove_device().
> 3) az_blob_remove_device() waits for the file_list to become empty.
> 4) After the file_list becomes empty, but before misc_deregister() is called, a
> separate thread opens the device again.
> 5) In the separate thread, az_blob_fop_open() obtains the file_lock spin lock.
> 6) Before az_blob_fop_open() releases the spin lock, az
> block_remove_device() completes, along with az_blob_remove().
> 7) Then the device gets rebound, and az_blob_connect_to_vsp() gets called,
> all while az_blob_fop_open() still holds the spin lock. So the spin lock gets re-
> initialized while it is held.
>
> This is admittedly a far-fetched scenario, but stranger things have
> happened. :-) The issue is that you are counting on the az_blob_dev structure
> to persist and have a valid file_lock, even while the device is unbound. So any
> initialization should only happen in az_blob_drv_init().
I'm not sure if az_blob_probe() and az_blob_remove() can be called at the same time, as az_blob_remove_vmbus() is called last in az_blob_remove(). Is it possible for vmbus to ask the driver to probe a new channel before the old channel is closed? I expect vmbus to guarantee that those calls are made in sequence.
>
> > + INIT_LIST_HEAD(&az_blob_dev.file_list);
> > + init_waitqueue_head(&az_blob_dev.file_wait);
> > +
> > + az_blob_dev.removing = false;
> > +
> > + az_blob_dev.device = device;
> > + device->channel->rqstor_size = device_queue_depth;
> > +
> > + ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
> > + az_blob_on_channel_callback, device->channel);
> > +
> > + if (ret) {
> > + az_blob_err("failed to connect to VSP ret %d\n", ret);
> > + return ret;
> > + }
> > +
> > + hv_set_drvdata(device, &az_blob_dev);
> > +
> > + return ret;
> > +}
> > +
> > +static void az_blob_remove_vmbus(struct hv_device *device) {
> > + /* At this point, no VSC/VSP traffic is possible over vmbus */
> > + hv_set_drvdata(device, NULL);
> > + vmbus_close(device->channel);
> > +}
> > +
> > +static int az_blob_probe(struct hv_device *device,
> > + const struct hv_vmbus_device_id *dev_id) {
> > + int rc;
> > +
> > + az_blob_dbg("probing device\n");
> > +
> > + rc = az_blob_connect_to_vsp(device, az_blob_ringbuffer_size);
> > + if (rc) {
> > + az_blob_err("error connecting to VSP rc %d\n", rc);
> > + return rc;
> > + }
> > +
> > + // create user-mode client library facing device
> > + rc = az_blob_create_device(&az_blob_dev);
> > + if (rc) {
> > + az_blob_remove_vmbus(device);
> > + return rc;
> > + }
> > +
> > + az_blob_dbg("successfully probed device\n");
> > + return 0;
> > +}
> > +
> > +static int az_blob_remove(struct hv_device *dev) {
> > + struct az_blob_device *device = hv_get_drvdata(dev);
> > + unsigned long flags;
> > +
> > + spin_lock_irqsave(&device->file_lock, flags);
> > + device->removing = true;
> > + spin_unlock_irqrestore(&device->file_lock, flags);
> > +
> > + az_blob_remove_device(device);
> > + az_blob_remove_vmbus(dev);
> > + return 0;
> > +}
> > +
> > +static struct hv_driver az_blob_drv = {
> > + .name = KBUILD_MODNAME,
> > + .id_table = id_table,
> > + .probe = az_blob_probe,
> > + .remove = az_blob_remove,
> > + .driver = {
> > + .probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > + },
> > +};
> > +
> > +static int __init az_blob_drv_init(void) {
> > + int ret;
> > +
> > + ret = vmbus_driver_register(&az_blob_drv);
> > + return ret;
> > +}
> > +
> > +static void __exit az_blob_drv_exit(void) {
> > + vmbus_driver_unregister(&az_blob_drv);
> > +}
> > +
> > +MODULE_LICENSE("Dual BSD/GPL");
> > +MODULE_DESCRIPTION("Microsoft Azure Blob driver");
> > +module_init(az_blob_drv_init); module_exit(az_blob_drv_exit);
> > diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> > index 0c75662..436fdeb 100644
> > --- a/drivers/hv/channel_mgmt.c
> > +++ b/drivers/hv/channel_mgmt.c
> > @@ -154,6 +154,13 @@
> > .allowed_in_isolated = false,
> > },
> >
> > + /* Azure Blob */
> > + { .dev_type = HV_AZURE_BLOB,
> > + HV_AZURE_BLOB_GUID,
> > + .perf_device = false,
> > + .allowed_in_isolated = false,
> > + },
> > +
> > /* Unknown GUID */
> > { .dev_type = HV_UNKNOWN,
> > .perf_device = false,
> > diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index
> > d1e59db..ac31362 100644
> > --- a/include/linux/hyperv.h
> > +++ b/include/linux/hyperv.h
> > @@ -772,6 +772,7 @@ enum vmbus_device_type {
> > HV_FCOPY,
> > HV_BACKUP,
> > HV_DM,
> > + HV_AZURE_BLOB,
> > HV_UNKNOWN,
> > };
> >
> > @@ -1350,6 +1351,14 @@ int vmbus_allocate_mmio(struct resource
> **new, struct hv_device *device_obj,
> > 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f)
> >
> > /*
> > + * Azure Blob GUID
> > + * {0590b792-db64-45cc-81db-b8d70c577c9e}
> > + */
> > +#define HV_AZURE_BLOB_GUID \
> > + .guid = GUID_INIT(0x0590b792, 0xdb64, 0x45cc, 0x81, 0xdb, \
> > + 0xb8, 0xd7, 0x0c, 0x57, 0x7c, 0x9e)
> > +
> > +/*
> > * Shutdown GUID
> > * {0e0b6031-5213-4934-818b-38d90ced39db}
> > */
> > diff --git a/include/uapi/misc/azure_blob.h
> > b/include/uapi/misc/azure_blob.h new file mode 100644 index
> > 0000000..29413fc
> > --- /dev/null
> > +++ b/include/uapi/misc/azure_blob.h
> > @@ -0,0 +1,31 @@
> > +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
> > +/* Copyright (c) 2021, Microsoft Corporation. */
> > +
> > +#ifndef _AZ_BLOB_H
> > +#define _AZ_BLOB_H
>
> Seems like a #include of asm/ioctl.h (or something similar) is needed so that
> _IOWR is defined. Also, a #include is needed for __u32, __aligned_u64,
> guid_t, etc.
Yes, somehow it still compiles because azure_blob.c happens to pull in those definitions through its other includes. Will fix this.
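The fix will probably look something like this (sketch; assuming linux/uuid.h is acceptable for providing guid_t in a uapi header):

#include <linux/ioctl.h>
#include <linux/types.h>
#include <linux/uuid.h>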
>
> > +
> > +/* user-mode sync request sent through ioctl */ struct
> > +az_blob_request_sync_response {
> > + __u32 status;
> > + __u32 response_len;
> > +};
> > +
> > +struct az_blob_request_sync {
> > + guid_t guid;
> > + __u32 timeout;
> > + __u32 request_len;
> > + __u32 response_len;
> > + __u32 data_len;
> > + __u32 data_valid;
> > + __aligned_u64 request_buffer;
> > + __aligned_u64 response_buffer;
> > + __aligned_u64 data_buffer;
> > + struct az_blob_request_sync_response response; };
> > +
> > +#define AZ_BLOB_MAGIC_NUMBER 'R'
> > +#define IOCTL_AZ_BLOB_DRIVER_USER_REQUEST \
> > + _IOWR(AZ_BLOB_MAGIC_NUMBER, 0xf0, \
> > + struct az_blob_request_sync)
> > +
> > +#endif /* define _AZ_BLOB_H */
> > --
> > 1.8.3.1
> Subject: Re: [Patch v2 2/3] Drivers: hv: add Azure Blob driver
>
> On Fri, Jun 25, 2021 at 11:30:19PM -0700, [email protected] wrote:
> > @@ -0,0 +1,655 @@
> > +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
>
> If you _REALLY_ want to do this, please document _WHY_ you are doing this
> in the changelog comment as dual-licensed Linux kernel code will just cause
> you a lot of problems in the long run and we want to be sure you understand
> all of the issues here and your managers agree with it.
>
> thanks,
>
> greg k-h
Will fix this.
Thanks,
Long
> Subject: Re: [Patch v2 2/3] Drivers: hv: add Azure Blob driver
>
> On Fri, Jun 25, 2021 at 11:30:19PM -0700, [email protected] wrote:
> > +#ifdef CONFIG_DEBUG_FS
> > +struct dentry *az_blob_debugfs_root;
> > +#endif
>
> No need to keep this dentry, just look it up if you need it.
Will fix this.
>
> > +
> > +static struct az_blob_device az_blob_dev;
> > +
> > +static int az_blob_ringbuffer_size = (128 * 1024);
> > +module_param(az_blob_ringbuffer_size, int, 0444);
> > +MODULE_PARM_DESC(az_blob_ringbuffer_size, "Ring buffer size
> > +(bytes)");
>
> This is NOT the 1990's, please do not create new module parameters.
> Just make this work properly for everyone.
The default value is chosen to work best for most workloads while not taking too many system resources. It should work out of the box for nearly all customers.
But we see that, from time to time, some customers still want to tune those values for their specific workloads. It's hard to predict every workload, and without a module parameter they would have to recompile the kernel to change the value. So the intent of this module parameter is: if the default value works for you, don't touch it.
>
> > +#define AZ_ERR 0
> > +#define AZ_WARN 1
> > +#define AZ_DBG 2
> > +static int log_level = AZ_ERR;
> > +module_param(log_level, int, 0644);
> > +MODULE_PARM_DESC(log_level,
> > + "Log level: 0 - Error (default), 1 - Warning, 2 - Debug.");
>
> A single driver does not need a special debug/log level, use the system-wide
> functions and all will "just work"
Will fix this.
>
> > +
> > +static uint device_queue_depth = 1024;
> > +module_param(device_queue_depth, uint, 0444);
> > +MODULE_PARM_DESC(device_queue_depth,
> > + "System level max queue depth for this device");
> > +
> > +#define az_blob_log(level, fmt, args...) \
> > +do { \
> > + if (level <= log_level) \
> > + pr_err("%s:%d " fmt, __func__, __LINE__, ##args); \
> > +} while (0)
> > +
> > +#define az_blob_dbg(fmt, args...) az_blob_log(AZ_DBG, fmt, ##args)
> > +#define az_blob_warn(fmt, args...) az_blob_log(AZ_WARN, fmt, ##args)
> > +#define az_blob_err(fmt, args...) az_blob_log(AZ_ERR, fmt, ##args)
>
> Again, no.
>
> Just use dev_dbg(), dev_warn(), and dev_err() and there is no need for
> anything "special". This is just one tiny driver, do not rewrite logic like this for
> no reason.
>
> > +static void az_blob_remove_device(struct az_blob_device *dev) {
> > + wait_event(dev->file_wait, list_empty(&dev->file_list));
> > + misc_deregister(&az_blob_misc_device);
> > +#ifdef CONFIG_DEBUG_FS
>
> No need for the #ifdef.
>
> > + debugfs_remove_recursive(az_blob_debugfs_root);
> > +#endif
> > + /* At this point, we won't get any requests from user-mode */ }
> > +
> > +static int az_blob_create_device(struct az_blob_device *dev) {
> > + int rc;
> > + struct dentry *d;
> > +
> > + rc = misc_register(&az_blob_misc_device);
> > + if (rc) {
> > + az_blob_err("misc_register failed rc %d\n", rc);
> > + return rc;
> > + }
> > +
> > +#ifdef CONFIG_DEBUG_FS
>
> No need for the #ifdef
>
> > + az_blob_debugfs_root = debugfs_create_dir("az_blob", NULL);
> > + if (!IS_ERR_OR_NULL(az_blob_debugfs_root)) {
>
> No need to check this.
>
> > + d = debugfs_create_file("pending_requests", 0400,
> > + az_blob_debugfs_root, NULL,
> > + &az_blob_debugfs_fops);
> > + if (IS_ERR_OR_NULL(d)) {
>
> How can that be NULL?
>
> No need to ever check any debugfs calls, please just make them and move on.
Will fix this.
Thank you,
Long
>
> thanks,
>
> greg k-h
> Subject: Re: [Patch v2 2/3] Drivers: hv: add Azure Blob driver
>
> On 26. 06. 21, 8:30, [email protected] wrote:
> > Azure Blob storage provides scalable and durable data storage for Azure.
> > (https://azure.microsoft.com/en-us/services/storage/blobs/)
> >
> > This driver adds support for accelerated access to Azure Blob storage.
> > As an alternative to REST APIs, it provides a fast data path that uses
> > host native network stack and secure direct data link for storage server
> access.
> >
> ...
> > Cc: Jiri Slaby <[email protected]>
>
> I am just curious, why am I CCed? Anyway, some comments below:
>
> > --- /dev/null
> > +++ b/drivers/hv/azure_blob.c
> > @@ -0,0 +1,655 @@
> ...
> > +static void az_blob_on_channel_callback(void *context) {
> > + struct vmbus_channel *channel = (struct vmbus_channel *)context;
>
> Have you fed this patch through checkpatch?
Yes, it didn't report any errors.
>
> > + const struct vmpacket_descriptor *desc;
> > +
> > + az_blob_dbg("entering interrupt from vmbus\n");
> > + foreach_vmbus_pkt(desc, channel) {
> > + struct az_blob_vsp_request_ctx *request_ctx;
> > + struct az_blob_vsp_response *response;
> > + u64 cmd_rqst = vmbus_request_addr(&channel->requestor,
> > + desc->trans_id);
> > + if (cmd_rqst == VMBUS_RQST_ERROR) {
> > + az_blob_err("incorrect transaction id %llu\n",
> > + desc->trans_id);
> > + continue;
> > + }
> > + request_ctx = (struct az_blob_vsp_request_ctx *) cmd_rqst;
> > + response = hv_pkt_data(desc);
> > +
> > + az_blob_dbg("got response for request %pUb status %u "
> > + "response_len %u\n",
> > + &request_ctx->request->guid, response->error,
> > + response->response_len);
> > + request_ctx->request->response.status = response->error;
> > + request_ctx->request->response.response_len =
> > + response->response_len;
> > + complete(&request_ctx->wait_vsp);
> > + }
> > +
> > +}
> ...
> > +static long az_blob_ioctl_user_request(struct file *filp, unsigned
> > +long arg) {
> > + struct az_blob_device *dev = &az_blob_dev;
> > + struct az_blob_file_ctx *file_ctx = filp->private_data;
> > + char __user *argp = (char __user *) arg;
> > + struct az_blob_request_sync request;
> > + struct az_blob_vsp_request_ctx request_ctx;
> > + unsigned long flags;
> > + int ret;
> > + size_t request_start, request_num_pages = 0;
> > + size_t response_start, response_num_pages = 0;
> > + size_t data_start, data_num_pages = 0, total_num_pages;
> > + struct page **request_pages = NULL, **response_pages = NULL;
> > + struct page **data_pages = NULL;
> > + struct vmbus_packet_mpb_array *desc;
> > + u64 *pfn_array;
> > + int desc_size;
> > + int page_idx;
> > + struct az_blob_vsp_request *vsp_request;
>
> Ugh, see further.
>
> > +
> > + /* Fast fail if device is being removed */
> > + if (dev->removing)
> > + return -ENODEV;
> > +
> > + if (!az_blob_safe_file_access(filp)) {
> > + az_blob_dbg("process %d(%s) changed security contexts
> after"
> > + " opening file descriptor\n",
> > + task_tgid_vnr(current), current->comm);
> > + return -EACCES;
> > + }
> > +
> > + if (copy_from_user(&request, argp, sizeof(request))) {
> > + az_blob_dbg("don't have permission to user provided
> buffer\n");
> > + return -EFAULT;
> > + }
> > +
> > + az_blob_dbg("az_blob ioctl request guid %pUb timeout %u
> request_len %u"
> > + " response_len %u data_len %u request_buffer %llx "
> > + "response_buffer %llx data_buffer %llx\n",
> > + &request.guid, request.timeout, request.request_len,
> > + request.response_len, request.data_len,
> request.request_buffer,
> > + request.response_buffer, request.data_buffer);
> > +
> > + if (!request.request_len || !request.response_len)
> > + return -EINVAL;
> > +
> > + if (request.data_len && request.data_len < request.data_valid)
> > + return -EINVAL;
> > +
> > + init_completion(&request_ctx.wait_vsp);
> > + request_ctx.request = &request;
> > +
> > + /*
> > + * Need to set rw to READ to have page table set up for passing to VSP.
> > + * Setting it to WRITE will cause the page table entry not allocated
> > + * as it's waiting on Copy-On-Write on next page fault. This doesn't
> > + * work in this scenario because VSP wants all the pages to be present.
> > + */
> > + ret = get_buffer_pages(READ, (void __user *) request.request_buffer,
>
> Does this need __force for sparse not to complain?
I didn't run sparse for the patch. Will look into it. Thanks for the pointer.
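For example, something like the following (untested sketch) should avoid an open-coded cast that sparse may flag, by converting the u64 carried in the uapi struct with u64_to_user_ptr():

	/* Sketch: turn the u64 from the uapi struct into a proper
	 * __user pointer instead of casting it by hand.
	 */
	ret = get_buffer_pages(READ, u64_to_user_ptr(request.request_buffer),
			       request.request_len, &request_pages,
			       &request_start, &request_num_pages);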
>
> > + request.request_len, &request_pages, &request_start,
> > + &request_num_pages);
> > + if (ret)
> > + goto get_user_page_failed;
> > +
> > + ret = get_buffer_pages(READ, (void __user *)
> request.response_buffer,
> > + request.response_len, &response_pages, &response_start,
> > + &response_num_pages);
> > + if (ret)
> > + goto get_user_page_failed;
> > +
> > + if (request.data_len) {
> > + ret = get_buffer_pages(READ,
> > + (void __user *) request.data_buffer, request.data_len,
> > + &data_pages, &data_start, &data_num_pages);
> > + if (ret)
> > + goto get_user_page_failed;
> > + }
> > +
> > + total_num_pages = request_num_pages + response_num_pages +
> > + data_num_pages;
> > + if (total_num_pages > AZ_BLOB_MAX_PAGES) {
> > + az_blob_dbg("number of DMA pages %lu buffer
> exceeding %u\n",
> > + total_num_pages, AZ_BLOB_MAX_PAGES);
> > + ret = -EINVAL;
> > + goto get_user_page_failed;
> > + }
> > +
> > + /* Construct a VMBUS packet and send it over to VSP */
> > + desc_size = sizeof(struct vmbus_packet_mpb_array) +
> > + sizeof(u64) * total_num_pages;
> > + desc = kzalloc(desc_size, GFP_KERNEL);
>
> Smells like a call for struct_size().
Yes, it's a good idea to use struct_size(). Will change this.
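Roughly like this (untested sketch, assuming range.pfn_array is the flexible array member being sized here):

	desc_size = struct_size(desc, range.pfn_array, total_num_pages);
	desc = kzalloc(desc_size, GFP_KERNEL);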
>
> > + vsp_request = kzalloc(sizeof(*vsp_request), GFP_KERNEL);
> > + if (!desc || !vsp_request) {
> > + kfree(desc);
> > + kfree(vsp_request);
> > + ret = -ENOMEM;
> > + goto get_user_page_failed;
> > + }
> > +
> > + desc->range.offset = 0;
> > + desc->range.len = total_num_pages * PAGE_SIZE;
> > + pfn_array = desc->range.pfn_array;
> > + page_idx = 0;
> > +
> > + if (request.data_len) {
> > + fill_in_page_buffer(pfn_array, &page_idx, data_pages,
> > + data_num_pages);
> > + vsp_request->data_buffer_offset = data_start;
> > + vsp_request->data_buffer_length = request.data_len;
> > + vsp_request->data_buffer_valid = request.data_valid;
> > + }
> > +
> > + fill_in_page_buffer(pfn_array, &page_idx, request_pages,
> > + request_num_pages);
> > + vsp_request->request_buffer_offset = request_start +
> > + data_num_pages * PAGE_SIZE;
> > + vsp_request->request_buffer_length = request.request_len;
> > +
> > + fill_in_page_buffer(pfn_array, &page_idx, response_pages,
> > + response_num_pages);
> > + vsp_request->response_buffer_offset = response_start +
> > + (data_num_pages + request_num_pages) * PAGE_SIZE;
> > + vsp_request->response_buffer_length = request.response_len;
> > +
> > + vsp_request->version = 0;
> > + vsp_request->timeout_ms = request.timeout;
> > + vsp_request->operation_type = AZ_BLOB_DRIVER_USER_REQUEST;
> > + guid_copy(&vsp_request->transaction_id, &request.guid);
> > +
> > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> > + list_add_tail(&request_ctx.list, &file_ctx->vsp_pending_requests);
> > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
> > +
> > + az_blob_dbg("sending request to VSP\n");
> > + az_blob_dbg("desc_size %u desc->range.len %u desc-
> >range.offset %u\n",
> > + desc_size, desc->range.len, desc->range.offset);
> > + az_blob_dbg("vsp_request data_buffer_offset %u
> data_buffer_length %u "
> > + "data_buffer_valid %u request_buffer_offset %u "
> > + "request_buffer_length %u response_buffer_offset %u "
> > + "response_buffer_length %u\n",
> > + vsp_request->data_buffer_offset,
> > + vsp_request->data_buffer_length,
> > + vsp_request->data_buffer_valid,
> > + vsp_request->request_buffer_offset,
> > + vsp_request->request_buffer_length,
> > + vsp_request->response_buffer_offset,
> > + vsp_request->response_buffer_length);
> > +
> > + ret = vmbus_sendpacket_mpb_desc(dev->device->channel, desc,
> desc_size,
> > + vsp_request, sizeof(*vsp_request), (u64) &request_ctx);
> > +
> > + kfree(desc);
> > + kfree(vsp_request);
> > + if (ret)
> > + goto vmbus_send_failed;
> > +
> > + wait_for_completion(&request_ctx.wait_vsp);
>
> Provided this is ioctl, this should likely be interruptible. You don't want to
> account to I/O load. The same likely for az_blob_fop_release.
The reason for the uninterruptible wait is that the memory for the I/O is pinned and passed to the host for direct I/O. At that point the host owns that memory, and we can't do anything with it until the host releases ownership.
>
> > +
> > + /*
> > + * At this point, the response is already written to request
> > + * by VMBUS completion handler, copy them to user-mode buffers
> > + * and return to user-mode
> > + */
> > + if (copy_to_user(argp +
> > + offsetof(struct az_blob_request_sync,
> > + response.status),
> > + &request.response.status,
>
> This is ugly, why don't you make argp the appropriate pointer instead of char
> *? You'd need not do copy_to_user twice then, right?
>
> > + sizeof(request.response.status))) {
> > + ret = -EFAULT;
> > + goto vmbus_send_failed;
> > + }
> > +
> > + if (copy_to_user(argp +
> > + offsetof(struct az_blob_request_sync,
> > + response.response_len),
>
> The same here.
I'll fix the pointers.
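Something along these lines (untested sketch): keep a typed user pointer and copy the whole response sub-struct back in a single call:

	struct az_blob_request_sync __user *urequest =
		(struct az_blob_request_sync __user *)arg;

	/* The VMBUS completion handler already filled request.response;
	 * copy it back to user space with one copy_to_user() instead of two.
	 */
	if (copy_to_user(&urequest->response, &request.response,
			 sizeof(request.response)))
		ret = -EFAULT;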
>
> > + &request.response.response_len,
> > + sizeof(request.response.response_len)))
> > + ret = -EFAULT;
> > +
> > +vmbus_send_failed:
> > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> > + list_del(&request_ctx.list);
> > + if (list_empty(&file_ctx->vsp_pending_requests))
> > + wake_up(&file_ctx->wait_vsp_pending);
> > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
> > +
> > +get_user_page_failed:
> > + free_buffer_pages(request_num_pages, request_pages);
> > + free_buffer_pages(response_num_pages, response_pages);
> > + free_buffer_pages(data_num_pages, data_pages);
> > +
> > + return ret;
> > +}
>
> This function is overly long. Care to split it (e.g. moving away the initialization
> of the structs and the debug stuff)?
This is true. I'm looking for ways to refactor it.
>
> > +
> > +static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd,
> > + unsigned long arg)
> > +{
> > + long ret = -EIO;
>
> EINVAL would be more appropriate.
Good point. I'll fix it.
>
> > +
> > + switch (cmd) {
> > + case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST:
> > + if (_IOC_SIZE(cmd) != sizeof(struct az_blob_request_sync))
> > + return -EINVAL;
>
> How can that happen, provided the switch (cmd) and case?
I see some other drivers checking it. It's probably not needed. Will remove it.
>
> > + ret = az_blob_ioctl_user_request(filp, arg);
> > + break;
>
> Simply:
> return az_blob_ioctl_user_request(filp, arg);
>
> > +
> > + default:
> > + az_blob_dbg("unrecognized IOCTL code %u\n", cmd);
> > + }
> > +
> > + return ret;
>
> So return -EINVAL here directly now.
>
> > +}
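Taking these suggestions together, the handler would collapse to something like this (untested sketch):

	static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd,
				      unsigned long arg)
	{
		switch (cmd) {
		case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST:
			return az_blob_ioctl_user_request(filp, arg);
		default:
			az_blob_dbg("unrecognized IOCTL code %u\n", cmd);
			return -EINVAL;
		}
	}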
> ...
> > +static int az_blob_connect_to_vsp(struct hv_device *device, u32
> > +ring_size) {
> > + int ret;
> ...
> > + int rc;
>
>
> Sometimes ret, sometimes rc, could you unify the two?
Will fix this.
>
> > +static int __init az_blob_drv_init(void) {
> > + int ret;
> > +
> > + ret = vmbus_driver_register(&az_blob_drv);
> > + return ret;
>
> Simply:
> return vmbus_driver_register(&az_blob_drv);
>
> regards,
> --
> --
> js
> suse labs
Thank you!
Long
On Thu, Jul 01, 2021 at 06:58:35AM +0000, Long Li wrote:
> > Subject: Re: [Patch v2 2/3] Drivers: hv: add Azure Blob driver
> > > +
> > > +static struct az_blob_device az_blob_dev;
> > > +
> > > +static int az_blob_ringbuffer_size = (128 * 1024);
> > > +module_param(az_blob_ringbuffer_size, int, 0444);
> > > +MODULE_PARM_DESC(az_blob_ringbuffer_size, "Ring buffer size
> > > +(bytes)");
> >
> > This is NOT the 1990's, please do not create new module parameters.
> > Just make this work properly for everyone.
>
The default value is chosen to work best for most workloads without consuming too many system resources. It should work out of the box for nearly all customers.
Please wrap your email lines :(
Anyway, great, it will work for everyone, no need for this to be
changed. Then just drop it.
But what we see is that, from time to time, some customers still want to change this value to suit their workloads, and it's hard to predict those workloads. Without a module parameter they would have to recompile the kernel to change it. So the intent of this module parameter is: if the default value works for you, don't touch it.
Can you not just determine at runtime that these workloads are
overflowing the buffer and increase the size of it? That would be best
otherwise you are forced to try to deal with a module parameter that is
for code, not for devices (which is why we do not like module parameters
for drivers.)
Please remove this for now and if you _REALLY_ need it in the future,
make it auto-tunable. Or if that is somehow impossible, then make it a
per-device option, through something like sysfs so that it will work
correctly per-device.
thanks,
greg k-h
On Thu, 2021-07-01 at 07:09 +0000, Long Li wrote:
> > On 26. 06. 21, 8:30, [email protected] wrote:
> > Have you fed this patch through checkpatch?
>
> Yes, it didn't throw out any errors.
Several warnings and checks though.
$ ./scripts/checkpatch.pl 2.patch --strict --terse
2.patch:68: WARNING: Possible unwrapped commit description (prefer a maximum 75 chars per line)
2.patch:148: WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
2.patch:173: CHECK: spinlock_t definition without comment
2.patch:220: CHECK: spinlock_t definition without comment
2.patch:250: CHECK: Alignment should match open parenthesis
2.patch:255: CHECK: Alignment should match open parenthesis
2.patch:257: CHECK: Macro argument 'level' may be better as '(level)' to avoid precedence issues
2.patch:280: CHECK: Alignment should match open parenthesis
2.patch:283: CHECK: No space is necessary after a cast
2.patch:287: WARNING: quoted string split across lines
2.patch:296: CHECK: Blank lines aren't necessary before a close brace '}'
2.patch:303: CHECK: Please don't use multiple blank lines
2.patch:308: CHECK: Please don't use multiple blank lines
2.patch:331: CHECK: Alignment should match open parenthesis
2.patch:348: CHECK: Alignment should match open parenthesis
2.patch:362: CHECK: Alignment should match open parenthesis
2.patch:371: CHECK: Alignment should match open parenthesis
2.patch:381: CHECK: Alignment should match open parenthesis
2.patch:404: CHECK: No space is necessary after a cast
2.patch:426: WARNING: quoted string split across lines
2.patch:437: WARNING: quoted string split across lines
2.patch:438: WARNING: quoted string split across lines
2.patch:458: CHECK: No space is necessary after a cast
2.patch:459: CHECK: Alignment should match open parenthesis
2.patch:464: CHECK: No space is necessary after a cast
2.patch:465: CHECK: Alignment should match open parenthesis
2.patch:472: CHECK: Alignment should match open parenthesis
2.patch:472: CHECK: No space is necessary after a cast
2.patch:482: CHECK: Alignment should match open parenthesis
2.patch:506: CHECK: Alignment should match open parenthesis
2.patch:513: CHECK: Alignment should match open parenthesis
2.patch:519: CHECK: Alignment should match open parenthesis
2.patch:535: CHECK: Alignment should match open parenthesis
2.patch:537: WARNING: quoted string split across lines
2.patch:538: WARNING: quoted string split across lines
2.patch:539: WARNING: quoted string split across lines
2.patch:549: CHECK: Alignment should match open parenthesis
2.patch:549: CHECK: No space is necessary after a cast
2.patch:565: CHECK: Alignment should match open parenthesis
2.patch:574: CHECK: Alignment should match open parenthesis
2.patch:595: CHECK: Alignment should match open parenthesis
2.patch:634: WARNING: quoted string split across lines
2.patch:639: CHECK: Alignment should match open parenthesis
2.patch:643: CHECK: Alignment should match open parenthesis
2.patch:646: CHECK: Alignment should match open parenthesis
2.patch:648: CHECK: Alignment should match open parenthesis
2.patch:650: CHECK: Alignment should match open parenthesis
2.patch:694: CHECK: braces {} should be used on all arms of this statement
2.patch:696: CHECK: Alignment should match open parenthesis
2.patch:703: CHECK: Unbalanced braces around else statement
2.patch:724: CHECK: Alignment should match open parenthesis
2.patch:744: CHECK: Alignment should match open parenthesis
total: 0 errors, 10 warnings, 42 checks, 749 lines checked
From: Long Li <[email protected]> Sent: Wednesday, June 30, 2021 11:51 PM
[snip]
> > > +static void az_blob_remove_device(struct az_blob_device *dev) {
> > > + wait_event(dev->file_wait, list_empty(&dev->file_list));
> > > + misc_deregister(&az_blob_misc_device);
> > > +#ifdef CONFIG_DEBUG_FS
> > > + debugfs_remove_recursive(az_blob_debugfs_root);
> > > +#endif
> > > + /* At this point, we won't get any requests from user-mode */ }
> > > +
> > > +static int az_blob_create_device(struct az_blob_device *dev) {
> > > + int rc;
> > > + struct dentry *d;
> > > +
> > > + rc = misc_register(&az_blob_misc_device);
> > > + if (rc) {
> > > + az_blob_err("misc_register failed rc %d\n", rc);
> > > + return rc;
> > > + }
> > > +
> > > +#ifdef CONFIG_DEBUG_FS
> > > + az_blob_debugfs_root = debugfs_create_dir("az_blob", NULL);
> > > + if (!IS_ERR_OR_NULL(az_blob_debugfs_root)) {
> > > + d = debugfs_create_file("pending_requests", 0400,
> > > + az_blob_debugfs_root, NULL,
> > > + &az_blob_debugfs_fops);
> > > + if (IS_ERR_OR_NULL(d)) {
> > > + az_blob_warn("failed to create debugfs file\n");
> > > + debugfs_remove_recursive(az_blob_debugfs_root);
> > > + az_blob_debugfs_root = NULL;
> > > + }
> > > + } else
> > > + az_blob_warn("failed to create debugfs root\n"); #endif
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static int az_blob_connect_to_vsp(struct hv_device *device, u32
> > > +ring_size) {
> > > + int ret;
> > > +
> > > + spin_lock_init(&az_blob_dev.file_lock);
> >
> > I'd argue that the spin lock should not be re-initialized here. Here's the
> > sequence where things go wrong:
> >
> > 1) The driver is unbound, so az_blob_remove() is called.
> > 2) az_blob_remove() sets the "removing" flag to true, and calls
> > az_blob_remove_device().
> > 3) az_blob_remove_device() waits for the file_list to become empty.
> > 4) After the file_list becomes empty, but before misc_deregister() is called, a
> > separate thread opens the device again.
> > 5) In the separate thread, az_blob_fop_open() obtains the file_lock spin lock.
> > 6) Before az_blob_fop_open() releases the spin lock, az_blob_remove_device()
> > completes, along with az_blob_remove().
> > 7) Then the device gets rebound, and az_blob_connect_to_vsp() gets called,
> > all while az_blob_fop_open() still holds the spin lock. So the spin lock get re-
> > initialized while it is held.
> >
> > This is admittedly a far-fetched scenario, but stranger things have
> > happened. :-) The issue is that you are counting on the az_blob_dev structure
> > to persist and have a valid file_lock, even while the device is unbound. So any
> > initialization should only happen in az_blob_drv_init().
>
> I'm not sure if az_blob_probe() and az_blob_remove() can be called at the
> same time, as az_blob_remove_vmbus() is called the last in az_blob_remove().
> Is it possible for vmbus asking the driver to probe a new channel before the
> old channel is closed? I expect the vmbus provide guarantee that those calls
> are made in sequence.
In my scenario above, az_blob_remove_vmbus() and az_blob_remove() run to
completion in Step #6, all while some other thread is still in the middle of an
open() call and holding the file_lock spin lock. Then in Step #7 az_blob_probe()
runs. So az_blob_remove() and az_blob_probe() execute sequentially, not at
the same time.
Michael
>
> >
> > > + INIT_LIST_HEAD(&az_blob_dev.file_list);
> > > + init_waitqueue_head(&az_blob_dev.file_wait);
> > > +
> > > + az_blob_dev.removing = false;
> > > +
> > > + az_blob_dev.device = device;
> > > + device->channel->rqstor_size = device_queue_depth;
> > > +
> > > + ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
> > > + az_blob_on_channel_callback, device->channel);
> > > +
> > > + if (ret) {
> > > + az_blob_err("failed to connect to VSP ret %d\n", ret);
> > > + return ret;
> > > + }
> > > +
> > > + hv_set_drvdata(device, &az_blob_dev);
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +static void az_blob_remove_vmbus(struct hv_device *device) {
> > > + /* At this point, no VSC/VSP traffic is possible over vmbus */
> > > + hv_set_drvdata(device, NULL);
> > > + vmbus_close(device->channel);
> > > +}
> > > +
> > > +static int az_blob_probe(struct hv_device *device,
> > > + const struct hv_vmbus_device_id *dev_id) {
> > > + int rc;
> > > +
> > > + az_blob_dbg("probing device\n");
> > > +
> > > + rc = az_blob_connect_to_vsp(device, az_blob_ringbuffer_size);
> > > + if (rc) {
> > > + az_blob_err("error connecting to VSP rc %d\n", rc);
> > > + return rc;
> > > + }
> > > +
> > > + // create user-mode client library facing device
> > > + rc = az_blob_create_device(&az_blob_dev);
> > > + if (rc) {
> > > + az_blob_remove_vmbus(device);
> > > + return rc;
> > > + }
> > > +
> > > + az_blob_dbg("successfully probed device\n");
> > > + return 0;
> > > +}
> > > +
> > > +static int az_blob_remove(struct hv_device *dev) {
> > > + struct az_blob_device *device = hv_get_drvdata(dev);
> > > + unsigned long flags;
> > > +
> > > + spin_lock_irqsave(&device->file_lock, flags);
> > > + device->removing = true;
> > > + spin_unlock_irqrestore(&device->file_lock, flags);
> > > +
> > > + az_blob_remove_device(device);
> > > + az_blob_remove_vmbus(dev);
> > > + return 0;
> > > +}
> > > +
> > > +static struct hv_driver az_blob_drv = {
> > > + .name = KBUILD_MODNAME,
> > > + .id_table = id_table,
> > > + .probe = az_blob_probe,
> > > + .remove = az_blob_remove,
> > > + .driver = {
> > > + .probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > > + },
> > > +};
> > > +
> > > +static int __init az_blob_drv_init(void) {
> > > + int ret;
> > > +
> > > + ret = vmbus_driver_register(&az_blob_drv);
> > > + return ret;
> > > +}
> > > +
> > > +static void __exit az_blob_drv_exit(void) {
> > > + vmbus_driver_unregister(&az_blob_drv);
> > > +}
> Subject: Re: [Patch v2 2/3] Drivers: hv: add Azure Blob driver
>
> On Thu, Jul 01, 2021 at 06:58:35AM +0000, Long Li wrote:
> > > Subject: Re: [Patch v2 2/3] Drivers: hv: add Azure Blob driver
> > > > +
> > > > +static struct az_blob_device az_blob_dev;
> > > > +
> > > > +static int az_blob_ringbuffer_size = (128 * 1024);
> > > > +module_param(az_blob_ringbuffer_size, int, 0444);
> > > > +MODULE_PARM_DESC(az_blob_ringbuffer_size, "Ring buffer size
> > > > +(bytes)");
> > >
> > > This is NOT the 1990's, please do not create new module parameters.
> > > Just make this work properly for everyone.
> >
> > The default value is chosen so that it works best for most workloads while
> not taking too much system resources. It should work out of box for nearly all
> customers.
>
> Please wrap your email lines :(
>
> Anyway, great, it will work for everyone, no need for this to be changed.
> Then just drop it.
>
> > But what we see is that from time to time, some customers still want to
> change those values to work best for their workloads. It's hard to predict their
> workload. They have to recompile the kernel if there is no module parameter
> to do it. So the intent of this module parameter is that if the default value
> works for you, don't mess up with it.
>
> Can you not just determine at runtime that these workloads are overflowing
> the buffer and increase the size of it? That would be best otherwise you are
> forced to try to deal with a module parameter that is for code, not for devices
> (which is why we do not like module parameters for drivers.)
>
> Please remove this for now and if you _REALLY_ need it in the future, make it
> auto-tunable. Or if that is somehow impossible, then make it a per-device
> option, through something like sysfs so that it will work correctly per-device.
>
> thanks,
>
> greg k-h
I got your point; I will remove it.
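If per-device tuning turns out to be really needed later, my understanding is that a sysfs attribute would be the way to go rather than a module parameter. A rough, untested sketch (it assumes struct az_blob_device grows a ring_size field that is only consumed at the next probe/vmbus_open):

	static ssize_t ring_size_show(struct device *dev,
				      struct device_attribute *attr, char *buf)
	{
		return sysfs_emit(buf, "%u\n", az_blob_dev.ring_size);
	}

	static ssize_t ring_size_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
	{
		u32 val;
		int ret;

		ret = kstrtou32(buf, 0, &val);
		if (ret)
			return ret;
		az_blob_dev.ring_size = val;
		return count;
	}
	static DEVICE_ATTR_RW(ring_size);

	/* the attribute would be created in az_blob_probe() with
	 * device_create_file(&device->device, &dev_attr_ring_size);
	 */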
Thanks,
Long
> Subject: Re: [Patch v2 2/3] Drivers: hv: add Azure Blob driver
>
> On Thu, 2021-07-01 at 07:09 +0000, Long Li wrote:
> > > On 26. 06. 21, 8:30, [email protected] wrote:
>
> > > Have you fed this patch through checkpatch?
> >
> > Yes, it didn't throw out any errors.
>
> Several warnings and checks though.
>
> $ ./scripts/checkpatch.pl 2.patch --strict --terse
Thanks for the pointer. I didn't check with "--strict".
Will fix this.
> 2.patch:68: WARNING: Possible unwrapped commit description (prefer a
> maximum 75 chars per line)
> 2.patch:148: WARNING: added, moved or deleted file(s), does MAINTAINERS
> need updating?
> 2.patch:173: CHECK: spinlock_t definition without comment
> 2.patch:220: CHECK: spinlock_t definition without comment
> 2.patch:250: CHECK: Alignment should match open parenthesis
> 2.patch:255: CHECK: Alignment should match open parenthesis
> 2.patch:257: CHECK: Macro argument 'level' may be better as '(level)' to avoid
> precedence issues
> 2.patch:280: CHECK: Alignment should match open parenthesis
> 2.patch:283: CHECK: No space is necessary after a cast
> 2.patch:287: WARNING: quoted string split across lines
> 2.patch:296: CHECK: Blank lines aren't necessary before a close brace '}'
> 2.patch:303: CHECK: Please don't use multiple blank lines
> 2.patch:308: CHECK: Please don't use multiple blank lines
> 2.patch:331: CHECK: Alignment should match open parenthesis
> 2.patch:348: CHECK: Alignment should match open parenthesis
> 2.patch:362: CHECK: Alignment should match open parenthesis
> 2.patch:371: CHECK: Alignment should match open parenthesis
> 2.patch:381: CHECK: Alignment should match open parenthesis
> 2.patch:404: CHECK: No space is necessary after a cast
> 2.patch:426: WARNING: quoted string split across lines
> 2.patch:437: WARNING: quoted string split across lines
> 2.patch:438: WARNING: quoted string split across lines
> 2.patch:458: CHECK: No space is necessary after a cast
> 2.patch:459: CHECK: Alignment should match open parenthesis
> 2.patch:464: CHECK: No space is necessary after a cast
> 2.patch:465: CHECK: Alignment should match open parenthesis
> 2.patch:472: CHECK: Alignment should match open parenthesis
> 2.patch:472: CHECK: No space is necessary after a cast
> 2.patch:482: CHECK: Alignment should match open parenthesis
> 2.patch:506: CHECK: Alignment should match open parenthesis
> 2.patch:513: CHECK: Alignment should match open parenthesis
> 2.patch:519: CHECK: Alignment should match open parenthesis
> 2.patch:535: CHECK: Alignment should match open parenthesis
> 2.patch:537: WARNING: quoted string split across lines
> 2.patch:538: WARNING: quoted string split across lines
> 2.patch:539: WARNING: quoted string split across lines
> 2.patch:549: CHECK: Alignment should match open parenthesis
> 2.patch:549: CHECK: No space is necessary after a cast
> 2.patch:565: CHECK: Alignment should match open parenthesis
> 2.patch:574: CHECK: Alignment should match open parenthesis
> 2.patch:595: CHECK: Alignment should match open parenthesis
> 2.patch:634: WARNING: quoted string split across lines
> 2.patch:639: CHECK: Alignment should match open parenthesis
> 2.patch:643: CHECK: Alignment should match open parenthesis
> 2.patch:646: CHECK: Alignment should match open parenthesis
> 2.patch:648: CHECK: Alignment should match open parenthesis
> 2.patch:650: CHECK: Alignment should match open parenthesis
> 2.patch:694: CHECK: braces {} should be used on all arms of this statement
> 2.patch:696: CHECK: Alignment should match open parenthesis
> 2.patch:703: CHECK: Unbalanced braces around else statement
> 2.patch:724: CHECK: Alignment should match open parenthesis
> 2.patch:744: CHECK: Alignment should match open parenthesis
> total: 0 errors, 10 warnings, 42 checks, 749 lines checked
>
> Subject: RE: [Patch v2 2/3] Drivers: hv: add Azure Blob driver
>
> From: Long Li <[email protected]> Sent: Wednesday, June 30, 2021 11:51
> PM
>
> [snip]
>
> > > > +static void az_blob_remove_device(struct az_blob_device *dev) {
> > > > + wait_event(dev->file_wait, list_empty(&dev->file_list));
> > > > + misc_deregister(&az_blob_misc_device);
> > > > +#ifdef CONFIG_DEBUG_FS
> > > > + debugfs_remove_recursive(az_blob_debugfs_root);
> > > > +#endif
> > > > + /* At this point, we won't get any requests from user-mode */ }
> > > > +
> > > > +static int az_blob_create_device(struct az_blob_device *dev) {
> > > > + int rc;
> > > > + struct dentry *d;
> > > > +
> > > > + rc = misc_register(&az_blob_misc_device);
> > > > + if (rc) {
> > > > + az_blob_err("misc_register failed rc %d\n", rc);
> > > > + return rc;
> > > > + }
> > > > +
> > > > +#ifdef CONFIG_DEBUG_FS
> > > > + az_blob_debugfs_root = debugfs_create_dir("az_blob", NULL);
> > > > + if (!IS_ERR_OR_NULL(az_blob_debugfs_root)) {
> > > > + d = debugfs_create_file("pending_requests", 0400,
> > > > + az_blob_debugfs_root, NULL,
> > > > + &az_blob_debugfs_fops);
> > > > + if (IS_ERR_OR_NULL(d)) {
> > > > + az_blob_warn("failed to create debugfs file\n");
> > > > + debugfs_remove_recursive(az_blob_debugfs_root);
> > > > + az_blob_debugfs_root = NULL;
> > > > + }
> > > > + } else
> > > > + az_blob_warn("failed to create debugfs root\n"); #endif
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static int az_blob_connect_to_vsp(struct hv_device *device, u32
> > > > +ring_size) {
> > > > + int ret;
> > > > +
> > > > + spin_lock_init(&az_blob_dev.file_lock);
> > >
> > > I'd argue that the spin lock should not be re-initialized here.
> > > Here's the sequence where things go wrong:
> > >
> > > 1) The driver is unbound, so az_blob_remove() is called.
> > > 2) az_blob_remove() sets the "removing" flag to true, and calls
> > > az_blob_remove_device().
> > > 3) az_blob_remove_device() waits for the file_list to become empty.
> > > 4) After the file_list becomes empty, but before misc_deregister()
> > > is called, a separate thread opens the device again.
> > > 5) In the separate thread, az_blob_fop_open() obtains the file_lock spin
> lock.
> > > 6) Before az_blob_fop_open() releases the spin lock,
> > > az_blob_remove_device() completes, along with az_blob_remove().
> > > 7) Then the device gets rebound, and az_blob_connect_to_vsp() gets
> > > called, all while az_blob_fop_open() still holds the spin lock. So
> > > the spin lock get re- initialized while it is held.
> > >
> > > This is admittedly a far-fetched scenario, but stranger things have
> > > happened. :-) The issue is that you are counting on the az_blob_dev
> > > structure to persist and have a valid file_lock, even while the
> > > device is unbound. So any initialization should only happen in
> az_blob_drv_init().
> >
> > I'm not sure if az_blob_probe() and az_blob_remove() can be called at
> > the same time, as az_blob_remove_vmbus() is called the last in
> az_blob_remove().
> > Is it possible for vmbus asking the driver to probe a new channel
> > before the old channel is closed? I expect the vmbus provide guarantee
> > that those calls are made in sequence.
>
> In my scenario above, az_blob_remove_vmbus() and az_blob_remove() run
> to completion in Step #6, all while some other thread is still in the middle of
> an
> open() call and holding the file_lock spin lock. Then in Step #7
> az_blob_probe() runs. So az_blob_remove() and az_blob_probe() execute
> sequentially, not at the same time.
>
> Michael
I think it's a valid scenario. The return value of devtmpfs_delete_node() is not checked in misc_deregister(). It decreases the refcount on the inode, but there is no guarantee that nobody else is still using it (for example, in the middle of opening the file).
However, this works fine for "rmmod" causing the device to be removed. Before a file is opened, the refcount on the module is increased, so the module can't be removed while the file is being opened. In that case the scenario you described can't happen.
But during a VMBUS rescind it can happen. The driver could end up using a spinlock that has been re-initialized, if the next VMBUS offer on the same channel arrives before all in-flight open() calls have exited.
This is very rare, but I agree such things can happen and we should make sure the driver handles them. I'll update the driver.
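Concretely, the plan is to move the one-time initialization into module init, along these lines (untested sketch):

	static int __init az_blob_drv_init(void)
	{
		/* One-time init of the global device state, so a probe
		 * after a rescind never re-initializes a lock that an
		 * in-flight open() may still hold.
		 */
		spin_lock_init(&az_blob_dev.file_lock);
		INIT_LIST_HEAD(&az_blob_dev.file_list);
		init_waitqueue_head(&az_blob_dev.file_wait);

		return vmbus_driver_register(&az_blob_drv);
	}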
Long
>
> >
> > >
> > > > + INIT_LIST_HEAD(&az_blob_dev.file_list);
> > > > + init_waitqueue_head(&az_blob_dev.file_wait);
> > > > +
> > > > + az_blob_dev.removing = false;
> > > > +
> > > > + az_blob_dev.device = device;
> > > > + device->channel->rqstor_size = device_queue_depth;
> > > > +
> > > > + ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
> > > > + az_blob_on_channel_callback, device->channel);
> > > > +
> > > > + if (ret) {
> > > > + az_blob_err("failed to connect to VSP ret %d\n", ret);
> > > > + return ret;
> > > > + }
> > > > +
> > > > + hv_set_drvdata(device, &az_blob_dev);
> > > > +
> > > > + return ret;
> > > > +}
> > > > +
> > > > +static void az_blob_remove_vmbus(struct hv_device *device) {
> > > > + /* At this point, no VSC/VSP traffic is possible over vmbus */
> > > > + hv_set_drvdata(device, NULL);
> > > > + vmbus_close(device->channel);
> > > > +}
> > > > +
> > > > +static int az_blob_probe(struct hv_device *device,
> > > > + const struct hv_vmbus_device_id *dev_id) {
> > > > + int rc;
> > > > +
> > > > + az_blob_dbg("probing device\n");
> > > > +
> > > > + rc = az_blob_connect_to_vsp(device, az_blob_ringbuffer_size);
> > > > + if (rc) {
> > > > + az_blob_err("error connecting to VSP rc %d\n", rc);
> > > > + return rc;
> > > > + }
> > > > +
> > > > + // create user-mode client library facing device
> > > > + rc = az_blob_create_device(&az_blob_dev);
> > > > + if (rc) {
> > > > + az_blob_remove_vmbus(device);
> > > > + return rc;
> > > > + }
> > > > +
> > > > + az_blob_dbg("successfully probed device\n");
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static int az_blob_remove(struct hv_device *dev) {
> > > > + struct az_blob_device *device = hv_get_drvdata(dev);
> > > > + unsigned long flags;
> > > > +
> > > > + spin_lock_irqsave(&device->file_lock, flags);
> > > > + device->removing = true;
> > > > + spin_unlock_irqrestore(&device->file_lock, flags);
> > > > +
> > > > + az_blob_remove_device(device);
> > > > + az_blob_remove_vmbus(dev);
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static struct hv_driver az_blob_drv = {
> > > > + .name = KBUILD_MODNAME,
> > > > + .id_table = id_table,
> > > > + .probe = az_blob_probe,
> > > > + .remove = az_blob_remove,
> > > > + .driver = {
> > > > + .probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > > > + },
> > > > +};
> > > > +
> > > > +static int __init az_blob_drv_init(void) {
> > > > + int ret;
> > > > +
> > > > + ret = vmbus_driver_register(&az_blob_drv);
> > > > + return ret;
> > > > +}
> > > > +
> > > > +static void __exit az_blob_drv_exit(void) {
> > > > + vmbus_driver_unregister(&az_blob_drv);
> > > > +}
From: Long Li <[email protected]> Sent: Friday, July 2, 2021 4:59 PM
> >
> > [snip]
> >
> > > > > +static void az_blob_remove_device(struct az_blob_device *dev) {
> > > > > + wait_event(dev->file_wait, list_empty(&dev->file_list));
> > > > > + misc_deregister(&az_blob_misc_device);
> > > > > +#ifdef CONFIG_DEBUG_FS
> > > > > + debugfs_remove_recursive(az_blob_debugfs_root);
> > > > > +#endif
> > > > > + /* At this point, we won't get any requests from user-mode */ }
> > > > > +
> > > > > +static int az_blob_create_device(struct az_blob_device *dev) {
> > > > > + int rc;
> > > > > + struct dentry *d;
> > > > > +
> > > > > + rc = misc_register(&az_blob_misc_device);
> > > > > + if (rc) {
> > > > > + az_blob_err("misc_register failed rc %d\n", rc);
> > > > > + return rc;
> > > > > + }
> > > > > +
> > > > > +#ifdef CONFIG_DEBUG_FS
> > > > > + az_blob_debugfs_root = debugfs_create_dir("az_blob", NULL);
> > > > > + if (!IS_ERR_OR_NULL(az_blob_debugfs_root)) {
> > > > > + d = debugfs_create_file("pending_requests", 0400,
> > > > > + az_blob_debugfs_root, NULL,
> > > > > + &az_blob_debugfs_fops);
> > > > > + if (IS_ERR_OR_NULL(d)) {
> > > > > + az_blob_warn("failed to create debugfs file\n");
> > > > > + debugfs_remove_recursive(az_blob_debugfs_root);
> > > > > + az_blob_debugfs_root = NULL;
> > > > > + }
> > > > > + } else
> > > > > + az_blob_warn("failed to create debugfs root\n"); #endif
> > > > > +
> > > > > + return 0;
> > > > > +}
> > > > > +
> > > > > +static int az_blob_connect_to_vsp(struct hv_device *device, u32
> > > > > +ring_size) {
> > > > > + int ret;
> > > > > +
> > > > > + spin_lock_init(&az_blob_dev.file_lock);
> > > >
> > > > I'd argue that the spin lock should not be re-initialized here.
> > > > Here's the sequence where things go wrong:
> > > >
> > > > 1) The driver is unbound, so az_blob_remove() is called.
> > > > 2) az_blob_remove() sets the "removing" flag to true, and calls
> > > > az_blob_remove_device().
> > > > 3) az_blob_remove_device() waits for the file_list to become empty.
> > > > 4) After the file_list becomes empty, but before misc_deregister()
> > > > is called, a separate thread opens the device again.
> > > > 5) In the separate thread, az_blob_fop_open() obtains the file_lock spin
> > lock.
> > > > 6) Before az_blob_fop_open() releases the spin lock,
> > > > az_blob_remove_device() completes, along with az_blob_remove().
> > > > 7) Then the device gets rebound, and az_blob_connect_to_vsp() gets
> > > > called, all while az_blob_fop_open() still holds the spin lock. So
> > > > the spin lock get re- initialized while it is held.
> > > >
> > > > This is admittedly a far-fetched scenario, but stranger things have
> > > > happened. :-) The issue is that you are counting on the az_blob_dev
> > > > structure to persist and have a valid file_lock, even while the
> > > > device is unbound. So any initialization should only happen in
> > az_blob_drv_init().
> > >
> > > I'm not sure if az_blob_probe() and az_blob_remove() can be called at
> > > the same time, as az_blob_remove_vmbus() is called the last in
> > az_blob_remove().
> > > Is it possible for vmbus asking the driver to probe a new channel
> > > before the old channel is closed? I expect the vmbus provide guarantee
> > > that those calls are made in sequence.
> >
> > In my scenario above, az_blob_remove_vmbus() and az_blob_remove() run
> > to completion in Step #6, all while some other thread is still in the middle of
> > an
> > open() call and holding the file_lock spin lock. Then in Step #7
> > az_blob_probe() runs. So az_blob_remove() and az_blob_probe() execute
> > sequentially, not at the same time.
> >
> > Michael
>
> I think it's a valid scenario. The return value of devtmpfs_delete_node() is
> not checked in misc_deregister(). It decreases the refcount on inodes but it's
> not guaranteed that someone else is still using it (in the middle of opening a file).
>
> However, this works fine for "rmmod" that causes device to be removed.
> Before file is opened the refcount on the module is increased so it can't be
> removed when file is being opened. The scenario you described can't happen.
>
> But during VMBUS rescind, it can happen. It's possible that the driver is using
> the spinlock that has been re-initialized, when the next VMBUS offer on the
> same channel comes before all the attempting open file calls exit.
In my scenario, Step #1 is an unbind operation, not a module removal. But
you make a valid point about VMbus rescind, which has the same effect as
unbind.
>
> This is a very rare. I agree things happen that we should make sure the driver
> can handle this. I'll update the driver.
Sounds good.
Michael
From: Long Li <[email protected]> Sent: Friday, July 2, 2021 4:59 PM
> > PM
> >
> > [snip]
> >
> > > > > +static void az_blob_remove_device(struct az_blob_device *dev) {
> > > > > + wait_event(dev->file_wait, list_empty(&dev->file_list));
> > > > > + misc_deregister(&az_blob_misc_device);
> > > > > +#ifdef CONFIG_DEBUG_FS
> > > > > + debugfs_remove_recursive(az_blob_debugfs_root);
> > > > > +#endif
> > > > > + /* At this point, we won't get any requests from user-mode */ }
> > > > > +
> > > > > +static int az_blob_create_device(struct az_blob_device *dev) {
> > > > > + int rc;
> > > > > + struct dentry *d;
> > > > > +
> > > > > + rc = misc_register(&az_blob_misc_device);
> > > > > + if (rc) {
> > > > > + az_blob_err("misc_register failed rc %d\n", rc);
> > > > > + return rc;
> > > > > + }
> > > > > +
> > > > > +#ifdef CONFIG_DEBUG_FS
> > > > > + az_blob_debugfs_root = debugfs_create_dir("az_blob", NULL);
> > > > > + if (!IS_ERR_OR_NULL(az_blob_debugfs_root)) {
> > > > > + d = debugfs_create_file("pending_requests", 0400,
> > > > > + az_blob_debugfs_root, NULL,
> > > > > + &az_blob_debugfs_fops);
> > > > > + if (IS_ERR_OR_NULL(d)) {
> > > > > + az_blob_warn("failed to create debugfs file\n");
> > > > > + debugfs_remove_recursive(az_blob_debugfs_root);
> > > > > + az_blob_debugfs_root = NULL;
> > > > > + }
> > > > > + } else
> > > > > + az_blob_warn("failed to create debugfs root\n"); #endif
> > > > > +
> > > > > + return 0;
> > > > > +}
> > > > > +
> > > > > +static int az_blob_connect_to_vsp(struct hv_device *device, u32
> > > > > +ring_size) {
> > > > > + int ret;
> > > > > +
> > > > > + spin_lock_init(&az_blob_dev.file_lock);
> > > >
> > > > I'd argue that the spin lock should not be re-initialized here.
> > > > Here's the sequence where things go wrong:
> > > >
> > > > 1) The driver is unbound, so az_blob_remove() is called.
> > > > 2) az_blob_remove() sets the "removing" flag to true, and calls
> > > > az_blob_remove_device().
> > > > 3) az_blob_remove_device() waits for the file_list to become empty.
> > > > 4) After the file_list becomes empty, but before misc_deregister()
> > > > is called, a separate thread opens the device again.
> > > > 5) In the separate thread, az_blob_fop_open() obtains the file_lock spin
> > lock.
> > > > 6) Before az_blob_fop_open() releases the spin lock,
> > > > az_blob_remove_device() completes, along with az_blob_remove().
> > > > 7) Then the device gets rebound, and az_blob_connect_to_vsp() gets
> > > > called, all while az_blob_fop_open() still holds the spin lock. So
> > > > the spin lock get re- initialized while it is held.
> > > >
> > > > This is admittedly a far-fetched scenario, but stranger things have
> > > > happened. :-) The issue is that you are counting on the az_blob_dev
> > > > structure to persist and have a valid file_lock, even while the
> > > > device is unbound. So any initialization should only happen in
> > az_blob_drv_init().
> > >
> > > I'm not sure if az_blob_probe() and az_blob_remove() can be called at
> > > the same time, as az_blob_remove_vmbus() is called the last in
> > az_blob_remove().
> > > Is it possible for vmbus asking the driver to probe a new channel
> > > before the old channel is closed? I expect the vmbus provide guarantee
> > > that those calls are made in sequence.
> >
> > In my scenario above, az_blob_remove_vmbus() and az_blob_remove() run
> > to completion in Step #6, all while some other thread is still in the middle of
> > an
> > open() call and holding the file_lock spin lock. Then in Step #7
> > az_blob_probe() runs. So az_blob_remove() and az_blob_probe() execute
> > sequentially, not at the same time.
> >
> > Michael
>
> I think it's a valid scenario. The return value of devtmpfs_delete_node() is not checked in misc_deregister(). It decreases
> the refcount on inodes but it's not guaranteed that someone else is still using it (in the middle of opening a file).
>
> However, this works fine for "rmmod" that causes device to be removed. Before file is opened the refcount on the module
> is increased so it can't be removed when file is being opened. The scenario you described can't happen.
>
> But during VMBUS rescind, it can happen. It's possible that the driver is using the spinlock that has been re-initialized, when
> the next VMBUS offer on the same channel comes before all the attempting open file calls exit.
I was thinking about the rescind scenario. vmbus_onoffer_rescind()
will run on the global workqueue. If it eventually calls az_blob_remove()
and then az_blob_remove_device(), it will wait until the file_list is
empty, which essentially means waiting until user space processes
decide to close the instances they have open. This seems like a
problem that could block the global workqueue for a long time and
thereby hang the kernel. Is my reasoning valid? If so, I haven't
thought about what the solution might be. It seems like we do need
to wait until any in-progress requests to Hyper-V are complete because
Hyper-V has references to guest physical memory. But waiting for
all open instances to be closed seems to be problematic.
Michael
>
> This is a very rare. I agree things happen that we should make sure the driver can handle this. I'll update the driver.
>
> Long
>
> >
> > >
> > > >
> > > > > + INIT_LIST_HEAD(&az_blob_dev.file_list);
> > > > > + init_waitqueue_head(&az_blob_dev.file_wait);
> > > > > +
> > > > > + az_blob_dev.removing = false;
> > > > > +
> > > > > + az_blob_dev.device = device;
> > > > > + device->channel->rqstor_size = device_queue_depth;
> > > > > +
> > > > > + ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
> > > > > + az_blob_on_channel_callback, device->channel);
> > > > > +
> > > > > + if (ret) {
> > > > > + az_blob_err("failed to connect to VSP ret %d\n", ret);
> > > > > + return ret;
> > > > > + }
> > > > > +
> > > > > + hv_set_drvdata(device, &az_blob_dev);
> > > > > +
> > > > > + return ret;
> > > > > +}
> > > > > +
> > > > > +static void az_blob_remove_vmbus(struct hv_device *device) {
> > > > > + /* At this point, no VSC/VSP traffic is possible over vmbus */
> > > > > + hv_set_drvdata(device, NULL);
> > > > > + vmbus_close(device->channel);
> > > > > +}
> > > > > +
> > > > > +static int az_blob_probe(struct hv_device *device,
> > > > > + const struct hv_vmbus_device_id *dev_id) {
> > > > > + int rc;
> > > > > +
> > > > > + az_blob_dbg("probing device\n");
> > > > > +
> > > > > + rc = az_blob_connect_to_vsp(device, az_blob_ringbuffer_size);
> > > > > + if (rc) {
> > > > > + az_blob_err("error connecting to VSP rc %d\n", rc);
> > > > > + return rc;
> > > > > + }
> > > > > +
> > > > > + // create user-mode client library facing device
> > > > > + rc = az_blob_create_device(&az_blob_dev);
> > > > > + if (rc) {
> > > > > + az_blob_remove_vmbus(device);
> > > > > + return rc;
> > > > > + }
> > > > > +
> > > > > + az_blob_dbg("successfully probed device\n");
> > > > > + return 0;
> > > > > +}
> > > > > +
> > > > > +static int az_blob_remove(struct hv_device *dev) {
> > > > > + struct az_blob_device *device = hv_get_drvdata(dev);
> > > > > + unsigned long flags;
> > > > > +
> > > > > + spin_lock_irqsave(&device->file_lock, flags);
> > > > > + device->removing = true;
> > > > > + spin_unlock_irqrestore(&device->file_lock, flags);
> > > > > +
> > > > > + az_blob_remove_device(device);
> > > > > + az_blob_remove_vmbus(dev);
> > > > > + return 0;
> > > > > +}
> > > > > +
> > > > > +static struct hv_driver az_blob_drv = {
> > > > > + .name = KBUILD_MODNAME,
> > > > > + .id_table = id_table,
> > > > > + .probe = az_blob_probe,
> > > > > + .remove = az_blob_remove,
> > > > > + .driver = {
> > > > > + .probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > > > > + },
> > > > > +};
> > > > > +
> > > > > +static int __init az_blob_drv_init(void) {
> > > > > + int ret;
> > > > > +
> > > > > + ret = vmbus_driver_register(&az_blob_drv);
> > > > > + return ret;
> > > > > +}
> > > > > +
> > > > > +static void __exit az_blob_drv_exit(void) {
> > > > > + vmbus_driver_unregister(&az_blob_drv);
> > > > > +}
> Subject: RE: [Patch v2 2/3] Drivers: hv: add Azure Blob driver
>
> From: Long Li <[email protected]> Sent: Friday, July 2, 2021 4:59 PM
> > > PM
> > >
> > > [snip]
> > >
> > > > > > +static void az_blob_remove_device(struct az_blob_device *dev) {
> > > > > > + wait_event(dev->file_wait, list_empty(&dev->file_list));
> > > > > > + misc_deregister(&az_blob_misc_device);
> > > > > > +#ifdef CONFIG_DEBUG_FS
> > > > > > + debugfs_remove_recursive(az_blob_debugfs_root);
> > > > > > +#endif
> > > > > > + /* At this point, we won't get any requests from user-mode
> > > > > > +*/ }
> > > > > > +
> > > > > > +static int az_blob_create_device(struct az_blob_device *dev) {
> > > > > > + int rc;
> > > > > > + struct dentry *d;
> > > > > > +
> > > > > > + rc = misc_register(&az_blob_misc_device);
> > > > > > + if (rc) {
> > > > > > + az_blob_err("misc_register failed rc %d\n", rc);
> > > > > > + return rc;
> > > > > > + }
> > > > > > +
> > > > > > +#ifdef CONFIG_DEBUG_FS
> > > > > > + az_blob_debugfs_root = debugfs_create_dir("az_blob",
> NULL);
> > > > > > + if (!IS_ERR_OR_NULL(az_blob_debugfs_root)) {
> > > > > > + d = debugfs_create_file("pending_requests", 0400,
> > > > > > + az_blob_debugfs_root, NULL,
> > > > > > + &az_blob_debugfs_fops);
> > > > > > + if (IS_ERR_OR_NULL(d)) {
> > > > > > + az_blob_warn("failed to create debugfs
> file\n");
> > > > > > +
> debugfs_remove_recursive(az_blob_debugfs_root);
> > > > > > + az_blob_debugfs_root = NULL;
> > > > > > + }
> > > > > > + } else
> > > > > > + az_blob_warn("failed to create debugfs root\n");
> #endif
> > > > > > +
> > > > > > + return 0;
> > > > > > +}
> > > > > > +
> > > > > > +static int az_blob_connect_to_vsp(struct hv_device *device,
> > > > > > +u32
> > > > > > +ring_size) {
> > > > > > + int ret;
> > > > > > +
> > > > > > + spin_lock_init(&az_blob_dev.file_lock);
> > > > >
> > > > > I'd argue that the spin lock should not be re-initialized here.
> > > > > Here's the sequence where things go wrong:
> > > > >
> > > > > 1) The driver is unbound, so az_blob_remove() is called.
> > > > > 2) az_blob_remove() sets the "removing" flag to true, and calls
> > > > > az_blob_remove_device().
> > > > > 3) az_blob_remove_device() waits for the file_list to become empty.
> > > > > 4) After the file_list becomes empty, but before
> > > > > misc_deregister() is called, a separate thread opens the device again.
> > > > > 5) In the separate thread, az_blob_fop_open() obtains the
> > > > > file_lock spin
> > > lock.
> > > > > 6) Before az_blob_fop_open() releases the spin lock,
> > > > > az_blob_remove_device() completes, along with az_blob_remove().
> > > > > 7) Then the device gets rebound, and az_blob_connect_to_vsp()
> > > > > gets called, all while az_blob_fop_open() still holds the spin
> > > > > lock. So the spin lock get re- initialized while it is held.
> > > > >
> > > > > This is admittedly a far-fetched scenario, but stranger things
> > > > > have happened. :-) The issue is that you are counting on the
> > > > > az_blob_dev structure to persist and have a valid file_lock,
> > > > > even while the device is unbound. So any initialization should
> > > > > only happen in
> > > az_blob_drv_init().
> > > >
> > > > I'm not sure if az_blob_probe() and az_blob_remove() can be called
> > > > at the same time, as az_blob_remove_vmbus() is called the last in
> > > az_blob_remove().
> > > > Is it possible for vmbus asking the driver to probe a new channel
> > > > before the old channel is closed? I expect the vmbus provide
> > > > guarantee that those calls are made in sequence.
> > >
> > > In my scenario above, az_blob_remove_vmbus() and az_blob_remove()
> > > run to completion in Step #6, all while some other thread is still
> > > in the middle of an
> > > open() call and holding the file_lock spin lock. Then in Step #7
> > > az_blob_probe() runs. So az_blob_remove() and az_blob_probe()
> > > execute sequentially, not at the same time.
> > >
> > > Michael
> >
> > I think it's a valid scenario. The return value of
> > devtmpfs_delete_node() is not checked in misc_deregister(). It decreases
> the refcount on inodes but it's not guaranteed that someone else is still using
> it (in the middle of opening a file).
> >
> > However, this works fine for "rmmod" that causes device to be removed.
> > Before file is opened the refcount on the module is increased so it can't be
> removed when file is being opened. The scenario you described can't
> happen.
> >
> > But during VMBUS rescind, it can happen. It's possible that the driver
> > is using the spinlock that has been re-initialized, when the next VMBUS
> offer on the same channel comes before all the attempting open file calls
> exit.
>
> I was thinking about the rescind scenario. vmbus_onoffer_rescind() will run
> on the global workqueue. If it eventually calls az_blob_remove() and then
> az_blob_remove_device(), it will wait until the file_list is empty, which
> essentially means waiting until user space processes decide to close the
> instances they have open. This seems like a problem that could block the
> global workqueue for a long time and
> thereby hang the kernel. Is my reasoning valid? If so, I haven't
> thought about what the solution might be. It seems like we do need to wait
> until any in-progress requests to Hyper-V are complete because Hyper-V has
> references to guest physical memory. But waiting for all open instances to
> be closed seems to be problematic.
My tests showed that misc_deregister() causes all opened files to be released, as long as there is no pending I/O waiting in the driver.
If there is pending I/O, we must wait, because the VSP owns that I/O memory. The correct VSP behavior is to return all pending I/O along with the rescind. This is the same as what storvsc does on rescind.
It looks to me that waiting for opened files after the call to misc_deregister(), but before removing the vmbus channel, is a safe approach.
If the VSP behaves correctly, the rescind process should not block for long. If we want to handle a buggy VSP that takes forever to release its resources, we could create a dedicated work queue for rescind handling.
>
> Michael
>
> >
> > This is a very rare. I agree things happen that we should make sure the
> driver can handle this. I'll update the driver.
> >
> > Long
> >
> > >
> > > >
> > > > >
> > > > > > + INIT_LIST_HEAD(&az_blob_dev.file_list);
> > > > > > + init_waitqueue_head(&az_blob_dev.file_wait);
> > > > > > +
> > > > > > + az_blob_dev.removing = false;
> > > > > > +
> > > > > > + az_blob_dev.device = device;
> > > > > > + device->channel->rqstor_size = device_queue_depth;
> > > > > > +
> > > > > > + ret = vmbus_open(device->channel, ring_size, ring_size,
> NULL, 0,
> > > > > > + az_blob_on_channel_callback, device-
> >channel);
> > > > > > +
> > > > > > + if (ret) {
> > > > > > + az_blob_err("failed to connect to VSP ret %d\n", ret);
> > > > > > + return ret;
> > > > > > + }
> > > > > > +
> > > > > > + hv_set_drvdata(device, &az_blob_dev);
> > > > > > +
> > > > > > + return ret;
> > > > > > +}
> > > > > > +
> > > > > > +static void az_blob_remove_vmbus(struct hv_device *device) {
> > > > > > + /* At this point, no VSC/VSP traffic is possible over vmbus */
> > > > > > + hv_set_drvdata(device, NULL);
> > > > > > + vmbus_close(device->channel); }
> > > > > > +
> > > > > > +static int az_blob_probe(struct hv_device *device,
> > > > > > + const struct hv_vmbus_device_id *dev_id) {
> > > > > > + int rc;
> > > > > > +
> > > > > > + az_blob_dbg("probing device\n");
> > > > > > +
> > > > > > + rc = az_blob_connect_to_vsp(device,
> az_blob_ringbuffer_size);
> > > > > > + if (rc) {
> > > > > > + az_blob_err("error connecting to VSP rc %d\n", rc);
> > > > > > + return rc;
> > > > > > + }
> > > > > > +
> > > > > > + // create user-mode client library facing device
> > > > > > + rc = az_blob_create_device(&az_blob_dev);
> > > > > > + if (rc) {
> > > > > > + az_blob_remove_vmbus(device);
> > > > > > + return rc;
> > > > > > + }
> > > > > > +
> > > > > > + az_blob_dbg("successfully probed device\n");
> > > > > > + return 0;
> > > > > > +}
> > > > > > +
> > > > > > +static int az_blob_remove(struct hv_device *dev) {
> > > > > > + struct az_blob_device *device = hv_get_drvdata(dev);
> > > > > > + unsigned long flags;
> > > > > > +
> > > > > > + spin_lock_irqsave(&device->file_lock, flags);
> > > > > > + device->removing = true;
> > > > > > + spin_unlock_irqrestore(&device->file_lock, flags);
> > > > > > +
> > > > > > + az_blob_remove_device(device);
> > > > > > + az_blob_remove_vmbus(dev);
> > > > > > + return 0;
> > > > > > +}
> > > > > > +
> > > > > > +static struct hv_driver az_blob_drv = {
> > > > > > + .name = KBUILD_MODNAME,
> > > > > > + .id_table = id_table,
> > > > > > + .probe = az_blob_probe,
> > > > > > + .remove = az_blob_remove,
> > > > > > + .driver = {
> > > > > > + .probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > > > > > + },
> > > > > > +};
> > > > > > +
> > > > > > +static int __init az_blob_drv_init(void) {
> > > > > > + int ret;
> > > > > > +
> > > > > > + ret = vmbus_driver_register(&az_blob_drv);
> > > > > > + return ret;
> > > > > > +}
> > > > > > +
> > > > > > +static void __exit az_blob_drv_exit(void) {
> > > > > > + vmbus_driver_unregister(&az_blob_drv);
> > > > > > +}
From: Long Li <[email protected]> Sent: Friday, July 9, 2021 12:44 PM
> >
> > From: Long Li <[email protected]> Sent: Friday, July 2, 2021 4:59 PM
> > > > PM
> > > >
> > > > [snip]
> > > >
> > > > > > > +static void az_blob_remove_device(struct az_blob_device *dev) {
> > > > > > > + wait_event(dev->file_wait, list_empty(&dev->file_list));
> > > > > > > + misc_deregister(&az_blob_misc_device);
> > > > > > > +#ifdef CONFIG_DEBUG_FS
> > > > > > > + debugfs_remove_recursive(az_blob_debugfs_root);
> > > > > > > +#endif
> > > > > > > + /* At this point, we won't get any requests from user-mode
> > > > > > > +*/ }
> > > > > > > +
> > > > > > > +static int az_blob_create_device(struct az_blob_device *dev) {
> > > > > > > + int rc;
> > > > > > > + struct dentry *d;
> > > > > > > +
> > > > > > > + rc = misc_register(&az_blob_misc_device);
> > > > > > > + if (rc) {
> > > > > > > + az_blob_err("misc_register failed rc %d\n", rc);
> > > > > > > + return rc;
> > > > > > > + }
> > > > > > > +
> > > > > > > +#ifdef CONFIG_DEBUG_FS
> > > > > > > + az_blob_debugfs_root = debugfs_create_dir("az_blob",
> > NULL);
> > > > > > > + if (!IS_ERR_OR_NULL(az_blob_debugfs_root)) {
> > > > > > > + d = debugfs_create_file("pending_requests", 0400,
> > > > > > > + az_blob_debugfs_root, NULL,
> > > > > > > + &az_blob_debugfs_fops);
> > > > > > > + if (IS_ERR_OR_NULL(d)) {
> > > > > > > + az_blob_warn("failed to create debugfs
> > file\n");
> > > > > > > +
> > debugfs_remove_recursive(az_blob_debugfs_root);
> > > > > > > + az_blob_debugfs_root = NULL;
> > > > > > > + }
> > > > > > > + } else
> > > > > > > + az_blob_warn("failed to create debugfs root\n");
> > #endif
> > > > > > > +
> > > > > > > + return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int az_blob_connect_to_vsp(struct hv_device *device,
> > > > > > > +u32
> > > > > > > +ring_size) {
> > > > > > > + int ret;
> > > > > > > +
> > > > > > > + spin_lock_init(&az_blob_dev.file_lock);
> > > > > >
> > > > > > I'd argue that the spin lock should not be re-initialized here.
> > > > > > Here's the sequence where things go wrong:
> > > > > >
> > > > > > 1) The driver is unbound, so az_blob_remove() is called.
> > > > > > 2) az_blob_remove() sets the "removing" flag to true, and calls
> > > > > > az_blob_remove_device().
> > > > > > 3) az_blob_remove_device() waits for the file_list to become empty.
> > > > > > 4) After the file_list becomes empty, but before
> > > > > > misc_deregister() is called, a separate thread opens the device again.
> > > > > > 5) In the separate thread, az_blob_fop_open() obtains the
> > > > > > file_lock spin
> > > > lock.
> > > > > > 6) Before az_blob_fop_open() releases the spin lock,
> > > > > > az_blob_remove_device() completes, along with az_blob_remove().
> > > > > > 7) Then the device gets rebound, and az_blob_connect_to_vsp()
> > > > > > gets called, all while az_blob_fop_open() still holds the spin
> > > > > > lock. So the spin lock get re- initialized while it is held.
> > > > > >
> > > > > > This is admittedly a far-fetched scenario, but stranger things
> > > > > > have happened. :-) The issue is that you are counting on the
> > > > > > az_blob_dev structure to persist and have a valid file_lock,
> > > > > > even while the device is unbound. So any initialization should
> > > > > > only happen in
> > > > az_blob_drv_init().
> > > > >
> > > > > I'm not sure if az_blob_probe() and az_blob_remove() can be called
> > > > > at the same time, as az_blob_remove_vmbus() is called last in
> > > > > az_blob_remove(). Is it possible for vmbus to ask the driver to
> > > > > probe a new channel before the old channel is closed? I expect
> > > > > vmbus to guarantee that those calls are made in sequence.
> > > >
> > > > In my scenario above, az_blob_remove_vmbus() and az_blob_remove()
> > > > run to completion in Step #6, all while some other thread is still
> > > > in the middle of an
> > > > open() call and holding the file_lock spin lock. Then in Step #7
> > > > az_blob_probe() runs. So az_blob_remove() and az_blob_probe()
> > > > execute sequentially, not at the same time.
> > > >
> > > > Michael
> > >
> > > I think it's a valid scenario. The return value of
> > > devtmpfs_delete_node() is not checked in misc_deregister(). It decreases
> > > the refcount on the inode, but there is no guarantee that no one else is
> > > still using it (in the middle of opening a file).
> > >
> > > However, this works fine for "rmmod", which causes the device to be
> > > removed. Before a file is opened, the refcount on the module is increased,
> > > so the module can't be removed while the file is being opened. In that
> > > case the scenario you described can't happen.
> > >
> > > But during a VMBUS rescind, it can happen. It's possible that the driver
> > > is using a spinlock that has been re-initialized, if the next VMBUS offer
> > > on the same channel arrives before all of the in-progress open() calls
> > > have exited.
> >
> > I was thinking about the rescind scenario. vmbus_onoffer_rescind() will run
> > on the global workqueue. If it eventually calls az_blob_remove() and then
> > az_blob_remove_device(), it will wait until the file_list is empty, which
> > essentially means waiting until user space processes decide to close the
> > instances they have open. This seems like a problem that could block the
> > global workqueue for a long time and
> > thereby hang the kernel. Is my reasoning valid? If so, I haven't
> > thought about what the solution might be. It seems like we do need to wait
> > until any in-progress requests to Hyper-V are complete because Hyper-V has
> > references to guest physical memory. But waiting for all open instances to
> > be closed seems to be problematic.
>
> My tests showed that misc_deregister() causes all open files to be released, provided there is no pending I/O waiting in
> the driver.
>
> If there is pending I/O, we must wait, because the VSP owns the memory backing that I/O. The correct VSP behavior is to
> return all the pending I/O along with the rescind. This is the same as what storvsc does for rescind.
Yes, we have to wait since the VSP owns the memory. But you make
a good point that the VSP behavior should be to return all the pending
I/Os at roughly the same time as the rescind.
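(Illustration only: the "wait for pending I/O" step could be expressed as a
drain before vmbus_close(). The names pending_count and pending_wait below are
hypothetical and are not fields of the posted driver.)

/*
 * Hypothetical sketch: block until the VSP has completed every outstanding
 * request, since Hyper-V still references the guest physical pages backing
 * in-flight I/Os. Only then is it safe to tear down the channel.
 */
static void az_blob_drain_pending(struct az_blob_device *dev)
{
	wait_event(dev->pending_wait, atomic_read(&dev->pending_count) == 0);
}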
>
> It looks to me that waiting for open files after the call to misc_deregister(), but before removing the vmbus channel, is a
> safe approach.
To me, this would be a great solution. And it closes the original timing
window I pointed out, where some other thread could open the device
after az_blob_remove_device() has waited for the file_list to become
empty but before misc_deregister() is called. With the order swapped,
once the file_list is empty, there's no way for the device to be opened
again. So it solves the problem with the spin lock initialization.
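For concreteness, a minimal sketch of the reordered teardown being discussed
(not the posted patch) would be:

static void az_blob_remove_device(struct az_blob_device *dev)
{
	/*
	 * Deregister the misc device first so no new open() can reach the
	 * driver, then wait for the files that are already open to be
	 * released.
	 */
	misc_deregister(&az_blob_misc_device);
	wait_event(dev->file_wait, list_empty(&dev->file_list));
#ifdef CONFIG_DEBUG_FS
	debugfs_remove_recursive(az_blob_debugfs_root);
#endif
	/* At this point, we won't get any requests from user-mode */
}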
>
> If the VSP is behaving correctly, the rescind process should not block for too long. If we want to deal with a buggy VSP that
> takes forever to release its resources, we would need to create a dedicated work queue for rescind handling.
I don't think we need to deal with a buggy VSP holding memory for
an excessively long time.
Michael
> > >
> > > This is very rare, but I agree it can happen, and we should make sure the
> > > driver can handle it. I'll update the driver.
> > >
> > > Long
> > >
> > > >
> > > > >
> > > > > >
> > > > > > > + INIT_LIST_HEAD(&az_blob_dev.file_list);
> > > > > > > + init_waitqueue_head(&az_blob_dev.file_wait);
> > > > > > > +
> > > > > > > + az_blob_dev.removing = false;
> > > > > > > +
> > > > > > > + az_blob_dev.device = device;
> > > > > > > + device->channel->rqstor_size = device_queue_depth;
> > > > > > > +
> > > > > > > +	ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
> > > > > > > +			 az_blob_on_channel_callback, device->channel);
> > > > > > > +
> > > > > > > + if (ret) {
> > > > > > > + az_blob_err("failed to connect to VSP ret %d\n", ret);
> > > > > > > + return ret;
> > > > > > > + }
> > > > > > > +
> > > > > > > + hv_set_drvdata(device, &az_blob_dev);
> > > > > > > +
> > > > > > > + return ret;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void az_blob_remove_vmbus(struct hv_device *device)
> > > > > > > +{
> > > > > > > +	/* At this point, no VSC/VSP traffic is possible over vmbus */
> > > > > > > +	hv_set_drvdata(device, NULL);
> > > > > > > +	vmbus_close(device->channel);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int az_blob_probe(struct hv_device *device,
> > > > > > > +			 const struct hv_vmbus_device_id *dev_id)
> > > > > > > +{
> > > > > > > + int rc;
> > > > > > > +
> > > > > > > + az_blob_dbg("probing device\n");
> > > > > > > +
> > > > > > > +	rc = az_blob_connect_to_vsp(device, az_blob_ringbuffer_size);
> > > > > > > + if (rc) {
> > > > > > > + az_blob_err("error connecting to VSP rc %d\n", rc);
> > > > > > > + return rc;
> > > > > > > + }
> > > > > > > +
> > > > > > > + // create user-mode client library facing device
> > > > > > > + rc = az_blob_create_device(&az_blob_dev);
> > > > > > > + if (rc) {
> > > > > > > + az_blob_remove_vmbus(device);
> > > > > > > + return rc;
> > > > > > > + }
> > > > > > > +
> > > > > > > + az_blob_dbg("successfully probed device\n");
> > > > > > > + return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int az_blob_remove(struct hv_device *dev)
> > > > > > > +{
> > > > > > > + struct az_blob_device *device = hv_get_drvdata(dev);
> > > > > > > + unsigned long flags;
> > > > > > > +
> > > > > > > + spin_lock_irqsave(&device->file_lock, flags);
> > > > > > > + device->removing = true;
> > > > > > > + spin_unlock_irqrestore(&device->file_lock, flags);
> > > > > > > +
> > > > > > > + az_blob_remove_device(device);
> > > > > > > + az_blob_remove_vmbus(dev);
> > > > > > > + return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static struct hv_driver az_blob_drv = {
> > > > > > > + .name = KBUILD_MODNAME,
> > > > > > > + .id_table = id_table,
> > > > > > > + .probe = az_blob_probe,
> > > > > > > + .remove = az_blob_remove,
> > > > > > > + .driver = {
> > > > > > > + .probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > > > > > > + },
> > > > > > > +};
> > > > > > > +
> > > > > > > +static int __init az_blob_drv_init(void)
> > > > > > > +{
> > > > > > > + int ret;
> > > > > > > +
> > > > > > > + ret = vmbus_driver_register(&az_blob_drv);
> > > > > > > + return ret;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void __exit az_blob_drv_exit(void)
> > > > > > > +{
> > > > > > > + vmbus_driver_unregister(&az_blob_drv);
> > > > > > > +}