From: Long Li <[email protected]>
Microsoft Azure Blob storage service exposes a REST API to applications
for data access. While it's flexible and works on most platforms, it's
not as efficient as native network stack.
This patchset implements a VSC that communicates with a VSP on the host
to execute blob storage access via native network stack on the host.
Reference:
https://azure.microsoft.com/en-us/services/storage/blobs/#overview
Long Li (3):
Drivers: hv: vmbus: add support to ignore certain PCIE devices
Drivers: hv: add Azure Blob driver
Drivers: hv: Add to maintainer for Azure Blob driver
Changes:
v2:
Refactored the code in vmbus to scan devices
Reworked Azure Blob driver and moved user-mode interfaces to uapi
v3:
Changed licensing language
Patch format passed "checkpatch --strict"
debugfs and logging, module parameter cleanup
General code clean up
Fix device removal race conditions
Documentation/userspace-api/ioctl/ioctl-number.rst | 2 +
MAINTAINERS | 1 +
drivers/hv/Kconfig | 10 +
drivers/hv/Makefile | 1 +
drivers/hv/azure_blob.c | 625 +++++++++++++++++++++
drivers/hv/channel_mgmt.c | 55 +-
include/linux/hyperv.h | 9 +
include/uapi/misc/azure_blob.h | 34 ++
8 files changed, 731 insertions(+), 6 deletions(-)
create mode 100644 drivers/hv/azure_blob.c
create mode 100644 include/uapi/misc/azure_blob.h
--
1.8.3.1
From: Long Li <[email protected]>
Add [email protected] to maintainer list for Azure Blob driver.
Cc: K. Y. Srinivasan <[email protected]>
Cc: Haiyang Zhang <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Cc: Wei Liu <[email protected]>
Cc: Dexuan Cui <[email protected]>
Signed-off-by: Long Li <[email protected]>
---
MAINTAINERS | 1 +
1 file changed, 1 insertion(+)
diff --git a/MAINTAINERS b/MAINTAINERS
index 9487061..b547eb9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8440,6 +8440,7 @@ M: Haiyang Zhang <[email protected]>
M: Stephen Hemminger <[email protected]>
M: Wei Liu <[email protected]>
M: Dexuan Cui <[email protected]>
+M: Long Li <[email protected]>
L: [email protected]
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git
--
1.8.3.1
From: Long Li <[email protected]>
Azure Blob storage provides scalable and durable data storage for Azure.
(https://azure.microsoft.com/en-us/services/storage/blobs/)
This driver adds support for accelerated access to Azure Blob storage. As an
alternative to REST APIs, it provides a fast data path that uses host native
network stack and secure direct data link for storage server access.
This driver will be ported to FreeBSD. It's dual licensed for BSD and GPL.
Cc: Jonathan Corbet <[email protected]>
Cc: "K. Y. Srinivasan" <[email protected]>
Cc: Haiyang Zhang <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Cc: Wei Liu <[email protected]>
Cc: Dexuan Cui <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Cc: Bjorn Andersson <[email protected]>
Cc: Hans de Goede <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Maximilian Luz <[email protected]>
Cc: Mike Rapoport <[email protected]>
Cc: Ben Widawsky <[email protected]>
Cc: Jiri Slaby <[email protected]>
Cc: Andra Paraschiv <[email protected]>
Cc: Siddharth Gupta <[email protected]>
Cc: Hannes Reinecke <[email protected]>
Cc: [email protected]
Signed-off-by: Long Li <[email protected]>
---
Documentation/userspace-api/ioctl/ioctl-number.rst | 2 +
drivers/hv/Kconfig | 10 +
drivers/hv/Makefile | 1 +
drivers/hv/azure_blob.c | 625 +++++++++++++++++++++
drivers/hv/channel_mgmt.c | 7 +
include/linux/hyperv.h | 9 +
include/uapi/misc/azure_blob.h | 34 ++
7 files changed, 688 insertions(+)
create mode 100644 drivers/hv/azure_blob.c
create mode 100644 include/uapi/misc/azure_blob.h
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 9bfc2b5..d3c2a90 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -180,6 +180,8 @@ Code Seq# Include File Comments
'R' 01 linux/rfkill.h conflict!
'R' C0-DF net/bluetooth/rfcomm.h
'R' E0 uapi/linux/fsl_mc.h
+'R' F0-FF uapi/misc/azure_blob.h Microsoft Azure Blob driver
+ <mailto:[email protected]>
'S' all linux/cdrom.h conflict!
'S' 80-81 scsi/scsi_ioctl.h conflict!
'S' 82-FF scsi/scsi.h conflict!
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 66c794d..e08b8d3 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -27,4 +27,14 @@ config HYPERV_BALLOON
help
Select this option to enable Hyper-V Balloon driver.
+config HYPERV_AZURE_BLOB
+ tristate "Microsoft Azure Blob driver"
+ depends on HYPERV && X86_64
+ help
+ Select this option to enable Microsoft Azure Blob driver.
+
+ This driver supports accelerated Microsoft Azure Blob access.
+ To compile this driver as a module, choose M here. The module will be
+ called azure_blob.
+
endmenu
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index 94daf82..a322575 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -2,6 +2,7 @@
obj-$(CONFIG_HYPERV) += hv_vmbus.o
obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
+obj-$(CONFIG_HYPERV_AZURE_BLOB) += azure_blob.o
CFLAGS_hv_trace.o = -I$(src)
CFLAGS_hv_balloon.o = -I$(src)
diff --git a/drivers/hv/azure_blob.c b/drivers/hv/azure_blob.c
new file mode 100644
index 0000000..5367d5e
--- /dev/null
+++ b/drivers/hv/azure_blob.c
@@ -0,0 +1,625 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH Linux-syscall-note
+/* Copyright (c) Microsoft Corporation. */
+
+#include <uapi/misc/azure_blob.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/debugfs.h>
+#include <linux/pagemap.h>
+#include <linux/hyperv.h>
+#include <linux/miscdevice.h>
+#include <linux/uio.h>
+
+struct az_blob_device {
+ struct hv_device *device;
+
+ /* Opened files maintained by this device */
+ struct list_head file_list;
+ /* Lock for protecting file_list */
+ spinlock_t file_lock;
+ wait_queue_head_t file_wait;
+
+ bool removing;
+};
+
+/* VSP messages */
+enum az_blob_vsp_request_type {
+ AZ_BLOB_DRIVER_REQUEST_FIRST = 0x100,
+ AZ_BLOB_DRIVER_USER_REQUEST = 0x100,
+ AZ_BLOB_DRIVER_REGISTER_BUFFER = 0x101,
+ AZ_BLOB_DRIVER_DEREGISTER_BUFFER = 0x102,
+};
+
+/* VSC->VSP request */
+struct az_blob_vsp_request {
+ u32 version;
+ u32 timeout_ms;
+ u32 data_buffer_offset;
+ u32 data_buffer_length;
+ u32 data_buffer_valid;
+ u32 operation_type;
+ u32 request_buffer_offset;
+ u32 request_buffer_length;
+ u32 response_buffer_offset;
+ u32 response_buffer_length;
+ guid_t transaction_id;
+} __packed;
+
+/* VSP->VSC response */
+struct az_blob_vsp_response {
+ u32 length;
+ u32 error;
+ u32 response_len;
+} __packed;
+
+struct az_blob_vsp_request_ctx {
+ struct list_head list;
+ struct completion wait_vsp;
+ struct az_blob_request_sync *request;
+};
+
+struct az_blob_file_ctx {
+ struct list_head list;
+
+ /* List of pending requests to VSP */
+ struct list_head vsp_pending_requests;
+ /* Lock for protecting vsp_pending_requests */
+ spinlock_t vsp_pending_lock;
+ wait_queue_head_t wait_vsp_pending;
+};
+
+/* The maximum number of pages we can pass to VSP in a single packet */
+#define AZ_BLOB_MAX_PAGES 8192
+
+/* The one and only one */
+static struct az_blob_device az_blob_dev;
+
+/* Ring buffer size in bytes */
+#define AZ_BLOB_RING_SIZE (128 * 1024)
+
+/* System wide device queue depth */
+#define AZ_BLOB_QUEUE_DEPTH 1024
+
+static const struct hv_vmbus_device_id id_table[] = {
+ { HV_AZURE_BLOB_GUID,
+ .driver_data = 0
+ },
+ { },
+};
+
+#define az_blob_dbg(fmt, args...) dev_dbg(&az_blob_dev.device->device, fmt, ##args)
+#define az_blob_info(fmt, args...) dev_info(&az_blob_dev.device->device, fmt, ##args)
+#define az_blob_warn(fmt, args...) dev_warn(&az_blob_dev.device->device, fmt, ##args)
+#define az_blob_err(fmt, args...) dev_err(&az_blob_dev.device->device, fmt, ##args)
+
+static void az_blob_on_channel_callback(void *context)
+{
+ struct vmbus_channel *channel = (struct vmbus_channel *)context;
+ const struct vmpacket_descriptor *desc;
+
+ az_blob_dbg("entering interrupt from vmbus\n");
+ foreach_vmbus_pkt(desc, channel) {
+ struct az_blob_vsp_request_ctx *request_ctx;
+ struct az_blob_vsp_response *response;
+ u64 cmd_rqst = vmbus_request_addr(&channel->requestor,
+ desc->trans_id);
+ if (cmd_rqst == VMBUS_RQST_ERROR) {
+ az_blob_err("incorrect transaction id %llu\n",
+ desc->trans_id);
+ continue;
+ }
+ request_ctx = (struct az_blob_vsp_request_ctx *)cmd_rqst;
+ response = hv_pkt_data(desc);
+
+ az_blob_dbg("got response for request %pUb status %u "
+ "response_len %u\n",
+ &request_ctx->request->guid, response->error,
+ response->response_len);
+ request_ctx->request->response.status = response->error;
+ request_ctx->request->response.response_len =
+ response->response_len;
+ complete(&request_ctx->wait_vsp);
+ }
+}
+
+static int az_blob_fop_open(struct inode *inode, struct file *file)
+{
+ struct az_blob_file_ctx *file_ctx;
+ unsigned long flags;
+
+ file_ctx = kzalloc(sizeof(*file_ctx), GFP_KERNEL);
+ if (!file_ctx)
+ return -ENOMEM;
+
+ rcu_read_lock();
+
+ if (az_blob_dev.removing) {
+ rcu_read_unlock();
+ kfree(file_ctx);
+ return -ENODEV;
+ }
+
+ INIT_LIST_HEAD(&file_ctx->vsp_pending_requests);
+ init_waitqueue_head(&file_ctx->wait_vsp_pending);
+ spin_lock_init(&file_ctx->vsp_pending_lock);
+ file->private_data = file_ctx;
+
+ spin_lock_irqsave(&az_blob_dev.file_lock, flags);
+ list_add_tail(&file_ctx->list, &az_blob_dev.file_list);
+ spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
+
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int az_blob_fop_release(struct inode *inode, struct file *file)
+{
+ struct az_blob_file_ctx *file_ctx = file->private_data;
+ unsigned long flags;
+
+ wait_event(file_ctx->wait_vsp_pending,
+ list_empty(&file_ctx->vsp_pending_requests));
+
+ spin_lock_irqsave(&az_blob_dev.file_lock, flags);
+ list_del(&file_ctx->list);
+ if (list_empty(&az_blob_dev.file_list))
+ wake_up(&az_blob_dev.file_wait);
+ spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
+
+ kfree(file_ctx);
+
+ return 0;
+}
+
+static inline bool az_blob_safe_file_access(struct file *file)
+{
+ return file->f_cred == current_cred() && !uaccess_kernel();
+}
+
+static int get_buffer_pages(int rw, void __user *buffer, u32 buffer_len,
+ struct page ***ppages, size_t *start,
+ size_t *num_pages)
+{
+ struct iovec iov;
+ struct iov_iter iter;
+ int ret;
+ ssize_t result;
+ struct page **pages;
+
+ ret = import_single_range(rw, buffer, buffer_len, &iov, &iter);
+ if (ret) {
+ az_blob_dbg("request buffer access error %d\n", ret);
+ return ret;
+ }
+ az_blob_dbg("iov_iter type %d offset %lu count %lu nr_segs %lu\n",
+ iter.type, iter.iov_offset, iter.count, iter.nr_segs);
+
+ result = iov_iter_get_pages_alloc(&iter, &pages, buffer_len, start);
+ if (result < 0) {
+ az_blob_dbg("failed to pin user pages result=%ld\n", result);
+ return result;
+ }
+ if (result != buffer_len) {
+ az_blob_dbg("can't pin user pages requested %d got %ld\n",
+ buffer_len, result);
+ return -EFAULT;
+ }
+
+ *ppages = pages;
+ *num_pages = (result + *start + PAGE_SIZE - 1) / PAGE_SIZE;
+ return 0;
+}
+
+static void fill_in_page_buffer(u64 *pfn_array, int *index,
+ struct page **pages, unsigned long num_pages)
+{
+ int i, page_idx = *index;
+
+ for (i = 0; i < num_pages; i++)
+ pfn_array[page_idx++] = page_to_pfn(pages[i]);
+ *index = page_idx;
+}
+
+static void free_buffer_pages(size_t num_pages, struct page **pages)
+{
+ unsigned long i;
+
+ for (i = 0; i < num_pages; i++)
+ if (pages[i])
+ put_page(pages[i]);
+ kvfree(pages);
+}
+
+static long az_blob_ioctl_user_request(struct file *filp, unsigned long arg)
+{
+ struct az_blob_device *dev = &az_blob_dev;
+ struct az_blob_file_ctx *file_ctx = filp->private_data;
+ struct az_blob_request_sync __user *request_user =
+ (struct az_blob_request_sync __user *)arg;
+ struct az_blob_request_sync request;
+ struct az_blob_vsp_request_ctx request_ctx;
+ unsigned long flags;
+ int ret;
+ size_t request_start, request_num_pages = 0;
+ size_t response_start, response_num_pages = 0;
+ size_t data_start, data_num_pages = 0, total_num_pages;
+ struct page **request_pages = NULL, **response_pages = NULL;
+ struct page **data_pages = NULL;
+ struct vmbus_packet_mpb_array *desc;
+ u64 *pfn_array;
+ int desc_size;
+ int page_idx;
+ struct az_blob_vsp_request *vsp_request;
+
+ /* Fast fail if device is being removed */
+ if (dev->removing)
+ return -ENODEV;
+
+ if (!az_blob_safe_file_access(filp)) {
+ az_blob_dbg("process %d(%s) changed security contexts after"
+ " opening file descriptor\n",
+ task_tgid_vnr(current), current->comm);
+ return -EACCES;
+ }
+
+ if (copy_from_user(&request, request_user, sizeof(request))) {
+ az_blob_dbg("don't have permission to user provided buffer\n");
+ return -EFAULT;
+ }
+
+ az_blob_dbg("az_blob ioctl request guid %pUb timeout %u request_len %u"
+ " response_len %u data_len %u request_buffer %llx "
+ "response_buffer %llx data_buffer %llx\n",
+ &request.guid, request.timeout, request.request_len,
+ request.response_len, request.data_len, request.request_buffer,
+ request.response_buffer, request.data_buffer);
+
+ if (!request.request_len || !request.response_len)
+ return -EINVAL;
+
+ if (request.data_len && request.data_len < request.data_valid)
+ return -EINVAL;
+
+ if (request.data_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES ||
+ request.request_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES ||
+ request.response_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES)
+ return -EINVAL;
+
+ init_completion(&request_ctx.wait_vsp);
+ request_ctx.request = &request;
+
+ /*
+ * Need to set rw to READ to have page table set up for passing to VSP.
+ * Setting it to WRITE will cause the page table entry not allocated
+ * as it's waiting on Copy-On-Write on next page fault. This doesn't
+ * work in this scenario because VSP wants all the pages to be present.
+ */
+ ret = get_buffer_pages(READ, (void __user *)request.request_buffer,
+ request.request_len, &request_pages,
+ &request_start, &request_num_pages);
+ if (ret)
+ goto get_user_page_failed;
+
+ ret = get_buffer_pages(READ, (void __user *)request.response_buffer,
+ request.response_len, &response_pages,
+ &response_start, &response_num_pages);
+ if (ret)
+ goto get_user_page_failed;
+
+ if (request.data_len) {
+ ret = get_buffer_pages(READ,
+ (void __user *)request.data_buffer,
+ request.data_len, &data_pages,
+ &data_start, &data_num_pages);
+ if (ret)
+ goto get_user_page_failed;
+ }
+
+ total_num_pages = request_num_pages + response_num_pages +
+ data_num_pages;
+ if (total_num_pages > AZ_BLOB_MAX_PAGES) {
+ az_blob_dbg("number of DMA pages %lu buffer exceeding %u\n",
+ total_num_pages, AZ_BLOB_MAX_PAGES);
+ ret = -EINVAL;
+ goto get_user_page_failed;
+ }
+
+ /* Construct a VMBUS packet and send it over to VSP */
+ desc_size = struct_size(desc, range.pfn_array, total_num_pages);
+ desc = kzalloc(desc_size, GFP_KERNEL);
+ vsp_request = kzalloc(sizeof(*vsp_request), GFP_KERNEL);
+ if (!desc || !vsp_request) {
+ kfree(desc);
+ kfree(vsp_request);
+ ret = -ENOMEM;
+ goto get_user_page_failed;
+ }
+
+ desc->range.offset = 0;
+ desc->range.len = total_num_pages * PAGE_SIZE;
+ pfn_array = desc->range.pfn_array;
+ page_idx = 0;
+
+ if (request.data_len) {
+ fill_in_page_buffer(pfn_array, &page_idx, data_pages,
+ data_num_pages);
+ vsp_request->data_buffer_offset = data_start;
+ vsp_request->data_buffer_length = request.data_len;
+ vsp_request->data_buffer_valid = request.data_valid;
+ }
+
+ fill_in_page_buffer(pfn_array, &page_idx, request_pages,
+ request_num_pages);
+ vsp_request->request_buffer_offset = request_start +
+ data_num_pages * PAGE_SIZE;
+ vsp_request->request_buffer_length = request.request_len;
+
+ fill_in_page_buffer(pfn_array, &page_idx, response_pages,
+ response_num_pages);
+ vsp_request->response_buffer_offset = response_start +
+ (data_num_pages + request_num_pages) * PAGE_SIZE;
+ vsp_request->response_buffer_length = request.response_len;
+
+ vsp_request->version = 0;
+ vsp_request->timeout_ms = request.timeout;
+ vsp_request->operation_type = AZ_BLOB_DRIVER_USER_REQUEST;
+ guid_copy(&vsp_request->transaction_id, &request.guid);
+
+ spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
+ list_add_tail(&request_ctx.list, &file_ctx->vsp_pending_requests);
+ spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
+
+ az_blob_dbg("sending request to VSP\n");
+ az_blob_dbg("desc_size %u desc->range.len %u desc->range.offset %u\n",
+ desc_size, desc->range.len, desc->range.offset);
+ az_blob_dbg("vsp_request data_buffer_offset %u data_buffer_length %u "
+ "data_buffer_valid %u request_buffer_offset %u "
+ "request_buffer_length %u response_buffer_offset %u "
+ "response_buffer_length %u\n",
+ vsp_request->data_buffer_offset,
+ vsp_request->data_buffer_length,
+ vsp_request->data_buffer_valid,
+ vsp_request->request_buffer_offset,
+ vsp_request->request_buffer_length,
+ vsp_request->response_buffer_offset,
+ vsp_request->response_buffer_length);
+
+ ret = vmbus_sendpacket_mpb_desc(dev->device->channel, desc, desc_size,
+ vsp_request, sizeof(*vsp_request),
+ (u64)&request_ctx);
+
+ kfree(desc);
+ kfree(vsp_request);
+ if (ret)
+ goto vmbus_send_failed;
+
+ wait_for_completion(&request_ctx.wait_vsp);
+
+ /*
+ * At this point, the response is already written to request
+ * by VMBUS completion handler, copy them to user-mode buffers
+ * and return to user-mode
+ */
+ if (copy_to_user(&request_user->response, &request.response,
+ sizeof(request.response)))
+ ret = -EFAULT;
+
+vmbus_send_failed:
+ spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
+ list_del(&request_ctx.list);
+ if (list_empty(&file_ctx->vsp_pending_requests))
+ wake_up(&file_ctx->wait_vsp_pending);
+ spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
+
+get_user_page_failed:
+ free_buffer_pages(request_num_pages, request_pages);
+ free_buffer_pages(response_num_pages, response_pages);
+ free_buffer_pages(data_num_pages, data_pages);
+
+ return ret;
+}
+
+static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST:
+ return az_blob_ioctl_user_request(filp, arg);
+
+ default:
+ az_blob_dbg("unrecognized IOCTL code %u\n", cmd);
+ }
+
+ return -EINVAL;
+}
+
+static const struct file_operations az_blob_client_fops = {
+ .owner = THIS_MODULE,
+ .open = az_blob_fop_open,
+ .unlocked_ioctl = az_blob_fop_ioctl,
+ .release = az_blob_fop_release,
+};
+
+static struct miscdevice az_blob_misc_device = {
+ MISC_DYNAMIC_MINOR,
+ "azure_blob",
+ &az_blob_client_fops,
+};
+
+static int az_blob_show_pending_requests(struct seq_file *m, void *v)
+{
+ unsigned long flags, flags2;
+ struct az_blob_vsp_request_ctx *request_ctx;
+ struct az_blob_file_ctx *file_ctx;
+
+ seq_puts(m, "List of pending requests\n");
+ seq_puts(m, "UUID request_len response_len data_len "
+ "request_buffer response_buffer data_buffer\n");
+ spin_lock_irqsave(&az_blob_dev.file_lock, flags);
+ list_for_each_entry(file_ctx, &az_blob_dev.file_list, list) {
+ spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags2);
+ list_for_each_entry(request_ctx,
+ &file_ctx->vsp_pending_requests, list) {
+ seq_printf(m, "%pUb ", &request_ctx->request->guid);
+ seq_printf(m, "%u ", request_ctx->request->request_len);
+ seq_printf(m, "%u ",
+ request_ctx->request->response_len);
+ seq_printf(m, "%u ", request_ctx->request->data_len);
+ seq_printf(m, "%llx ",
+ request_ctx->request->request_buffer);
+ seq_printf(m, "%llx ",
+ request_ctx->request->response_buffer);
+ seq_printf(m, "%llx\n",
+ request_ctx->request->data_buffer);
+ }
+ spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags2);
+ }
+ spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
+
+ return 0;
+}
+
+static int az_blob_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, az_blob_show_pending_requests, NULL);
+}
+
+static const struct file_operations az_blob_debugfs_fops = {
+ .open = az_blob_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static void az_blob_remove_device(void)
+{
+ struct dentry *debugfs_root = debugfs_lookup("az_blob", NULL);
+
+ misc_deregister(&az_blob_misc_device);
+ debugfs_remove_recursive(debugfs_lookup("pending_requests",
+ debugfs_root));
+ debugfs_remove_recursive(debugfs_root);
+ /* At this point, we won't get any requests from user-mode */
+}
+
+static int az_blob_create_device(struct az_blob_device *dev)
+{
+ int ret;
+ struct dentry *debugfs_root;
+
+ ret = misc_register(&az_blob_misc_device);
+ if (ret)
+ return ret;
+
+ debugfs_root = debugfs_create_dir("az_blob", NULL);
+ debugfs_create_file("pending_requests", 0400, debugfs_root, NULL,
+ &az_blob_debugfs_fops);
+
+ return 0;
+}
+
+static int az_blob_connect_to_vsp(struct hv_device *device, u32 ring_size)
+{
+ int ret;
+
+ spin_lock_init(&az_blob_dev.file_lock);
+ INIT_LIST_HEAD(&az_blob_dev.file_list);
+ init_waitqueue_head(&az_blob_dev.file_wait);
+
+ az_blob_dev.device = device;
+ device->channel->rqstor_size = AZ_BLOB_QUEUE_DEPTH;
+
+ ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
+ az_blob_on_channel_callback, device->channel);
+
+ if (ret)
+ return ret;
+
+ hv_set_drvdata(device, &az_blob_dev);
+
+ return ret;
+}
+
+static void az_blob_remove_vmbus(struct hv_device *device)
+{
+ /* At this point, no VSC/VSP traffic is possible over vmbus */
+ hv_set_drvdata(device, NULL);
+ vmbus_close(device->channel);
+}
+
+static int az_blob_probe(struct hv_device *device,
+ const struct hv_vmbus_device_id *dev_id)
+{
+ int ret;
+
+ ret = az_blob_connect_to_vsp(device, AZ_BLOB_RING_SIZE);
+ if (ret) {
+ az_blob_err("error connecting to VSP ret=%d\n", ret);
+ return ret;
+ }
+
+ // create user-mode client library facing device
+ ret = az_blob_create_device(&az_blob_dev);
+ if (ret) {
+ az_blob_err("failed to create device ret=%d\n", ret);
+ az_blob_remove_vmbus(device);
+ return ret;
+ }
+
+ az_blob_dev.removing = false;
+ az_blob_info("successfully probed device\n");
+
+ return 0;
+}
+
+static int az_blob_remove(struct hv_device *dev)
+{
+ az_blob_dev.removing = true;
+ /*
+ * RCU lock guarantees that any future calls to az_blob_fop_open()
+ * can not use device resources while the inode reference of
+ * /dev/azure_blob is being held for that open, and device file is
+ * being removed from /dev.
+ */
+ synchronize_rcu();
+
+ az_blob_info("removing device\n");
+ az_blob_remove_device();
+
+ /*
+ * At this point, it's not possible to open more files.
+ * Wait for all the opened files to be released.
+ */
+ wait_event(az_blob_dev.file_wait, list_empty(&az_blob_dev.file_list));
+
+ az_blob_remove_vmbus(dev);
+ return 0;
+}
+
+static struct hv_driver az_blob_drv = {
+ .name = KBUILD_MODNAME,
+ .id_table = id_table,
+ .probe = az_blob_probe,
+ .remove = az_blob_remove,
+ .driver = {
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+};
+
+static int __init az_blob_drv_init(void)
+{
+ return vmbus_driver_register(&az_blob_drv);
+}
+
+static void __exit az_blob_drv_exit(void)
+{
+ vmbus_driver_unregister(&az_blob_drv);
+}
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Microsoft Azure Blob driver");
+module_init(az_blob_drv_init);
+module_exit(az_blob_drv_exit);
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 705e95d..3095611 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -154,6 +154,13 @@
.allowed_in_isolated = false,
},
+ /* Azure Blob */
+ { .dev_type = HV_AZURE_BLOB,
+ HV_AZURE_BLOB_GUID,
+ .perf_device = false,
+ .allowed_in_isolated = false,
+ },
+
/* Unknown GUID */
{ .dev_type = HV_UNKNOWN,
.perf_device = false,
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index d1e59db..ac31362 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -772,6 +772,7 @@ enum vmbus_device_type {
HV_FCOPY,
HV_BACKUP,
HV_DM,
+ HV_AZURE_BLOB,
HV_UNKNOWN,
};
@@ -1350,6 +1351,14 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f)
/*
+ * Azure Blob GUID
+ * {0590b792-db64-45cc-81db-b8d70c577c9e}
+ */
+#define HV_AZURE_BLOB_GUID \
+ .guid = GUID_INIT(0x0590b792, 0xdb64, 0x45cc, 0x81, 0xdb, \
+ 0xb8, 0xd7, 0x0c, 0x57, 0x7c, 0x9e)
+
+/*
* Shutdown GUID
* {0e0b6031-5213-4934-818b-38d90ced39db}
*/
diff --git a/include/uapi/misc/azure_blob.h b/include/uapi/misc/azure_blob.h
new file mode 100644
index 0000000..f4168679
--- /dev/null
+++ b/include/uapi/misc/azure_blob.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH Linux-syscall-note */
+/* Copyright (c) Microsoft Corporation. */
+
+#ifndef _AZ_BLOB_H
+#define _AZ_BLOB_H
+
+#include <linux/kernel.h>
+#include <linux/uuid.h>
+
+/* user-mode sync request sent through ioctl */
+struct az_blob_request_sync_response {
+ __u32 status;
+ __u32 response_len;
+};
+
+struct az_blob_request_sync {
+ guid_t guid;
+ __u32 timeout;
+ __u32 request_len;
+ __u32 response_len;
+ __u32 data_len;
+ __u32 data_valid;
+ __aligned_u64 request_buffer;
+ __aligned_u64 response_buffer;
+ __aligned_u64 data_buffer;
+ struct az_blob_request_sync_response response;
+};
+
+#define AZ_BLOB_MAGIC_NUMBER 'R'
+#define IOCTL_AZ_BLOB_DRIVER_USER_REQUEST \
+ _IOWR(AZ_BLOB_MAGIC_NUMBER, 0xf0, \
+ struct az_blob_request_sync)
+
+#endif /* define _AZ_BLOB_H */
--
1.8.3.1
On Tue, Jul 13, 2021 at 07:45:21PM -0700, [email protected] wrote:
> From: Long Li <[email protected]>
>
> Azure Blob storage provides scalable and durable data storage for Azure.
> (https://azure.microsoft.com/en-us/services/storage/blobs/)
>
> This driver adds support for accelerated access to Azure Blob storage. As an
> alternative to REST APIs, it provides a fast data path that uses host native
> network stack and secure direct data link for storage server access.
So it goes around the block layer? Why?
> This driver will be ported to FreeBSD. It's dual licensed for BSD and GPL.
Being the copyright holder, you are free to relicense this code to any
other license you want to. So why is this single HV driver different
from all the other ones in this regard when it comes to the license?
Given that this driver only works when talking to GPL-only symbols in
the kernel, how could it be ported to freebsd as-is by anyone who is not
the copyright holder?
> Cc: Jonathan Corbet <[email protected]>
> Cc: "K. Y. Srinivasan" <[email protected]>
> Cc: Haiyang Zhang <[email protected]>
> Cc: Stephen Hemminger <[email protected]>
> Cc: Wei Liu <[email protected]>
> Cc: Dexuan Cui <[email protected]>
> Cc: Greg Kroah-Hartman <[email protected]>
> Cc: Bjorn Andersson <[email protected]>
> Cc: Hans de Goede <[email protected]>
> Cc: Dan Williams <[email protected]>
> Cc: Maximilian Luz <[email protected]>
> Cc: Mike Rapoport <[email protected]>
> Cc: Ben Widawsky <[email protected]>
> Cc: Jiri Slaby <[email protected]>
> Cc: Andra Paraschiv <[email protected]>
> Cc: Siddharth Gupta <[email protected]>
> Cc: Hannes Reinecke <[email protected]>
> Cc: [email protected]
> Signed-off-by: Long Li <[email protected]>
> ---
> Documentation/userspace-api/ioctl/ioctl-number.rst | 2 +
> drivers/hv/Kconfig | 10 +
> drivers/hv/Makefile | 1 +
> drivers/hv/azure_blob.c | 625 +++++++++++++++++++++
> drivers/hv/channel_mgmt.c | 7 +
> include/linux/hyperv.h | 9 +
> include/uapi/misc/azure_blob.h | 34 ++
> 7 files changed, 688 insertions(+)
> create mode 100644 drivers/hv/azure_blob.c
> create mode 100644 include/uapi/misc/azure_blob.h
>
> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> index 9bfc2b5..d3c2a90 100644
> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> @@ -180,6 +180,8 @@ Code Seq# Include File Comments
> 'R' 01 linux/rfkill.h conflict!
> 'R' C0-DF net/bluetooth/rfcomm.h
> 'R' E0 uapi/linux/fsl_mc.h
> +'R' F0-FF uapi/misc/azure_blob.h Microsoft Azure Blob driver
> + <mailto:[email protected]>
> 'S' all linux/cdrom.h conflict!
> 'S' 80-81 scsi/scsi_ioctl.h conflict!
> 'S' 82-FF scsi/scsi.h conflict!
> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> index 66c794d..e08b8d3 100644
> --- a/drivers/hv/Kconfig
> +++ b/drivers/hv/Kconfig
> @@ -27,4 +27,14 @@ config HYPERV_BALLOON
> help
> Select this option to enable Hyper-V Balloon driver.
>
> +config HYPERV_AZURE_BLOB
> + tristate "Microsoft Azure Blob driver"
> + depends on HYPERV && X86_64
> + help
> + Select this option to enable Microsoft Azure Blob driver.
> +
> + This driver supports accelerated Microsoft Azure Blob access.
No definition of what this is?
> + To compile this driver as a module, choose M here. The module will be
> + called azure_blob.
> +
> endmenu
> diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
> index 94daf82..a322575 100644
> --- a/drivers/hv/Makefile
> +++ b/drivers/hv/Makefile
> @@ -2,6 +2,7 @@
> obj-$(CONFIG_HYPERV) += hv_vmbus.o
> obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
> obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
> +obj-$(CONFIG_HYPERV_AZURE_BLOB) += azure_blob.o
Your naming scheme is different from the other hv modules, why?
>
> CFLAGS_hv_trace.o = -I$(src)
> CFLAGS_hv_balloon.o = -I$(src)
> diff --git a/drivers/hv/azure_blob.c b/drivers/hv/azure_blob.c
> new file mode 100644
> index 0000000..5367d5e
> --- /dev/null
> +++ b/drivers/hv/azure_blob.c
> @@ -0,0 +1,625 @@
> +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH Linux-syscall-note
.c files can NOT have the syscall note license, as that makes no sense
at all.
I'm stopping here. Please go run this through your legal department and
get them to sign off on this as it does not seem that you all understand
the issues when it comes to licenses and the Linux kernel at all. I
want to see a lawyer sign off on this patch next time if you all want to
attempt something crazy like this.
> +/* Copyright (c) Microsoft Corporation. */
You forgot a date, your lawyers will be signing you up for some
education classes now, have fun!
gre gk-h
One final thing:
On Tue, Jul 13, 2021 at 07:45:21PM -0700, [email protected] wrote:
> +#define az_blob_dbg(fmt, args...) dev_dbg(&az_blob_dev.device->device, fmt, ##args)
> +#define az_blob_info(fmt, args...) dev_info(&az_blob_dev.device->device, fmt, ##args)
> +#define az_blob_warn(fmt, args...) dev_warn(&az_blob_dev.device->device, fmt, ##args)
> +#define az_blob_err(fmt, args...) dev_err(&az_blob_dev.device->device, fmt, ##args)
This tiny driver does not deserve a different loging scheme from all of
the rest of the kernel. Just use the dev_* calls directly please, no
need to try to roll your own. Especially for a 600 line file.
And remove your calls to az_blob_info() that are just debugging stuff,
when the driver is working properly, it is quiet.
thanks,
greg k-h
On Tue, Jul 13, 2021 at 07:45:21PM -0700, [email protected] wrote:
> +static struct miscdevice az_blob_misc_device = {
> + MISC_DYNAMIC_MINOR,
> + "azure_blob",
> + &az_blob_client_fops,
> +};
Named initializers please.
On Tue, Jul 13, 2021 at 07:45:21PM -0700, [email protected] wrote:
> From: Long Li <[email protected]>
>
> Azure Blob storage provides scalable and durable data storage for Azure.
> (https://azure.microsoft.com/en-us/services/storage/blobs/)
>
> This driver adds support for accelerated access to Azure Blob storage. As an
> alternative to REST APIs, it provides a fast data path that uses host native
> network stack and secure direct data link for storage server access.
Where is the userspace code that interacts with this driver through your
custom ioctl interface?
thanks,
greg k-h
> Subject: Re: [Patch v3 2/3] Drivers: hv: add Azure Blob driver
>
> On Tue, Jul 13, 2021 at 07:45:21PM -0700, [email protected] wrote:
> > From: Long Li <[email protected]>
> >
> > Azure Blob storage provides scalable and durable data storage for Azure.
> >
> (https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fazu
> > re.microsoft.com%2Fen-
> us%2Fservices%2Fstorage%2Fblobs%2F&data=04%7
> >
> C01%7Clongli%40microsoft.com%7C950a81a410c54c2475a308d946ec0c09%7C
> 72f9
> >
> 88bf86f141af91ab2d7cd011db47%7C1%7C0%7C637618801920394272%7CUnk
> nown%7C
> >
> TWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiL
> CJXVC
> >
> I6Mn0%3D%7C1000&sdata=iLDP6EYsoCDlLvToJB31hIQxW3NdCnR0UH31
> FRDwvRI%
> > 3D&reserved=0)
> >
> > This driver adds support for accelerated access to Azure Blob storage.
> > As an alternative to REST APIs, it provides a fast data path that uses
> > host native network stack and secure direct data link for storage server
> access.
>
> So it goes around the block layer? Why?
Azure Blob is object-oriented storage solution designed for cloud environments.
While it's entirely possible to go through block layer, it's not as efficient as using
its native APIs for data access. Some of the security features (authentication,
tokens, lifecycle management) are not easily integrated into block layer. The
object model in Azure Blob is designed to be scalable and doesn't have many
limitations that block layer enforces (e.g. number of sectors).
Please refer to this link for different storage models in Azure:
https://docs.microsoft.com/en-us/azure/storage/common/storage-introduction#core-storage-services
>
> > This driver will be ported to FreeBSD. It's dual licensed for BSD and GPL.
>
> Being the copyright holder, you are free to relicense this code to any other
> license you want to. So why is this single HV driver different from all the
> other ones in this regard when it comes to the license?
>
> Given that this driver only works when talking to GPL-only symbols in the
> kernel, how could it be ported to freebsd as-is by anyone who is not the
> copyright holder?
>
> > Cc: Jonathan Corbet <[email protected]>
> > Cc: "K. Y. Srinivasan" <[email protected]>
> > Cc: Haiyang Zhang <[email protected]>
> > Cc: Stephen Hemminger <[email protected]>
> > Cc: Wei Liu <[email protected]>
> > Cc: Dexuan Cui <[email protected]>
> > Cc: Greg Kroah-Hartman <[email protected]>
> > Cc: Bjorn Andersson <[email protected]>
> > Cc: Hans de Goede <[email protected]>
> > Cc: Dan Williams <[email protected]>
> > Cc: Maximilian Luz <[email protected]>
> > Cc: Mike Rapoport <[email protected]>
> > Cc: Ben Widawsky <[email protected]>
> > Cc: Jiri Slaby <[email protected]>
> > Cc: Andra Paraschiv <[email protected]>
> > Cc: Siddharth Gupta <[email protected]>
> > Cc: Hannes Reinecke <[email protected]>
> > Cc: [email protected]
> > Signed-off-by: Long Li <[email protected]>
> > ---
> > Documentation/userspace-api/ioctl/ioctl-number.rst | 2 +
> > drivers/hv/Kconfig | 10 +
> > drivers/hv/Makefile | 1 +
> > drivers/hv/azure_blob.c | 625 +++++++++++++++++++++
> > drivers/hv/channel_mgmt.c | 7 +
> > include/linux/hyperv.h | 9 +
> > include/uapi/misc/azure_blob.h | 34 ++
> > 7 files changed, 688 insertions(+)
> > create mode 100644 drivers/hv/azure_blob.c create mode 100644
> > include/uapi/misc/azure_blob.h
> >
> > diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst
> > b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > index 9bfc2b5..d3c2a90 100644
> > --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> > +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > @@ -180,6 +180,8 @@ Code Seq# Include File
> Comments
> > 'R' 01 linux/rfkill.h conflict!
> > 'R' C0-DF net/bluetooth/rfcomm.h
> > 'R' E0 uapi/linux/fsl_mc.h
> > +'R' F0-FF uapi/misc/azure_blob.h Microsoft Azure Blob
> driver
> > +
> > +<mailto:[email protected]>
> > 'S' all linux/cdrom.h conflict!
> > 'S' 80-81 scsi/scsi_ioctl.h conflict!
> > 'S' 82-FF scsi/scsi.h conflict!
> > diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index
> > 66c794d..e08b8d3 100644
> > --- a/drivers/hv/Kconfig
> > +++ b/drivers/hv/Kconfig
> > @@ -27,4 +27,14 @@ config HYPERV_BALLOON
> > help
> > Select this option to enable Hyper-V Balloon driver.
> >
> > +config HYPERV_AZURE_BLOB
> > + tristate "Microsoft Azure Blob driver"
> > + depends on HYPERV && X86_64
> > + help
> > + Select this option to enable Microsoft Azure Blob driver.
> > +
> > + This driver supports accelerated Microsoft Azure Blob access.
>
> No definition of what this is?
>
> > + To compile this driver as a module, choose M here. The module will
> be
> > + called azure_blob.
> > +
> > endmenu
> > diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index
> > 94daf82..a322575 100644
> > --- a/drivers/hv/Makefile
> > +++ b/drivers/hv/Makefile
> > @@ -2,6 +2,7 @@
> > obj-$(CONFIG_HYPERV) += hv_vmbus.o
> > obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
> > obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
> > +obj-$(CONFIG_HYPERV_AZURE_BLOB) += azure_blob.o
>
> Your naming scheme is different from the other hv modules, why?
>
> >
> > CFLAGS_hv_trace.o = -I$(src)
> > CFLAGS_hv_balloon.o = -I$(src)
> > diff --git a/drivers/hv/azure_blob.c b/drivers/hv/azure_blob.c new
> > file mode 100644 index 0000000..5367d5e
> > --- /dev/null
> > +++ b/drivers/hv/azure_blob.c
> > @@ -0,0 +1,625 @@
> > +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH
> > +Linux-syscall-note
>
> .c files can NOT have the syscall note license, as that makes no sense at all.
>
> I'm stopping here. Please go run this through your legal department and get
> them to sign off on this as it does not seem that you all understand the issues
> when it comes to licenses and the Linux kernel at all. I want to see a lawyer
> sign off on this patch next time if you all want to attempt something crazy like
> this.
>
> > +/* Copyright (c) Microsoft Corporation. */
>
> You forgot a date, your lawyers will be signing you up for some education
> classes now, have fun!
>
> gre gk-h
I will address your comments on licensing.
Long
> Subject: Re: [Patch v3 2/3] Drivers: hv: add Azure Blob driver
>
> On Tue, Jul 13, 2021 at 07:45:21PM -0700, [email protected] wrote:
> > From: Long Li <[email protected]>
> >
> > Azure Blob storage provides scalable and durable data storage for Azure.
> >
> (https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fazu
> > re.microsoft.com%2Fen-
> us%2Fservices%2Fstorage%2Fblobs%2F&data=04%7
> >
> C01%7Clongli%40microsoft.com%7Ccb3aaf42b6744c4a565e08d946eca3e1%7C
> 72f9
> >
> 88bf86f141af91ab2d7cd011db47%7C1%7C0%7C637618804484490378%7CUnk
> nown%7C
> >
> TWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiL
> CJXVC
> >
> I6Mn0%3D%7C1000&sdata=XcAiTOWwDUWM7z2qt80vBL0YuS%2BdzsM
> 6T9JhVzcK8n
> > U%3D&reserved=0)
> >
> > This driver adds support for accelerated access to Azure Blob storage.
> > As an alternative to REST APIs, it provides a fast data path that uses
> > host native network stack and secure direct data link for storage server
> access.
>
> Where is the userspace code that interacts with this driver through your
> custom ioctl interface?
The user-space code is being developed on all supported languages for Blob access,
and will be placed in the github:
https://github.com/Azure/azure-sdk
I will add that information to the patch.
Long
>
> thanks,
>
> greg k-h
> Subject: Re: [Patch v3 2/3] Drivers: hv: add Azure Blob driver
>
> On Tue, Jul 13, 2021 at 07:45:21PM -0700, [email protected] wrote:
> > +static struct miscdevice az_blob_misc_device = {
> > + MISC_DYNAMIC_MINOR,
> > + "azure_blob",
> > + &az_blob_client_fops,
> > +};
>
> Named initializers please.
Will fix this along with the logging functions.
Long
On Tue, Jul 13, 2021 at 07:45:22PM -0700, [email protected] wrote:
> From: Long Li <[email protected]>
>
> Add [email protected] to maintainer list for Azure Blob driver.
>
> Cc: K. Y. Srinivasan <[email protected]>
> Cc: Haiyang Zhang <[email protected]>
> Cc: Stephen Hemminger <[email protected]>
> Cc: Wei Liu <[email protected]>
> Cc: Dexuan Cui <[email protected]>
> Signed-off-by: Long Li <[email protected]>
> ---
> MAINTAINERS | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 9487061..b547eb9 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -8440,6 +8440,7 @@ M: Haiyang Zhang <[email protected]>
> M: Stephen Hemminger <[email protected]>
> M: Wei Liu <[email protected]>
> M: Dexuan Cui <[email protected]>
> +M: Long Li <[email protected]>
You just added yourself to the maintainer of _all_ of the hyperv
drivers, was that intentional?
thanks,
greg k-h
On Wed, Jul 14, 2021 at 09:14:13PM +0000, Long Li wrote:
> > Subject: Re: [Patch v3 2/3] Drivers: hv: add Azure Blob driver
> >
> > On Tue, Jul 13, 2021 at 07:45:21PM -0700, [email protected] wrote:
> > > From: Long Li <[email protected]>
> > >
> > > Azure Blob storage provides scalable and durable data storage for Azure.
> > >
> > (https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fazu
> > > re.microsoft.com%2Fen-
> > us%2Fservices%2Fstorage%2Fblobs%2F&data=04%7
> > >
> > C01%7Clongli%40microsoft.com%7C950a81a410c54c2475a308d946ec0c09%7C
> > 72f9
> > >
> > 88bf86f141af91ab2d7cd011db47%7C1%7C0%7C637618801920394272%7CUnk
> > nown%7C
> > >
> > TWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiL
> > CJXVC
> > >
> > I6Mn0%3D%7C1000&sdata=iLDP6EYsoCDlLvToJB31hIQxW3NdCnR0UH31
> > FRDwvRI%
> > > 3D&reserved=0)
> > >
> > > This driver adds support for accelerated access to Azure Blob storage.
> > > As an alternative to REST APIs, it provides a fast data path that uses
> > > host native network stack and secure direct data link for storage server
> > access.
> >
> > So it goes around the block layer? Why?
>
> Azure Blob is object-oriented storage solution designed for cloud environments.
> While it's entirely possible to go through block layer, it's not as efficient as using
> its native APIs for data access. Some of the security features (authentication,
> tokens, lifecycle management) are not easily integrated into block layer. The
> object model in Azure Blob is designed to be scalable and doesn't have many
> limitations that block layer enforces (e.g. number of sectors).
What does the kernel's filesystem and block developers and maintainers
think about this end-run around that subsystem? Do they agree with this
one-off-custom-ioctl api? Who has reviewed this api to assure you that
it is sane and actually works properly?
And are you sure that the block layer is not efficient? You just lost
all use of caching and io_uring and loads of other kernel infrastructure
that has been developed and relied on for decades. You also just
abandoned the POSIX model and forced people to use a
random-custom-library just to access their storage devices, breaking all
existing programs in the world.
What programs today use this new api? Where is the api published and
what ensures that it will remain stable? What happens when it changes
over time, do we have to rebuild all userspace applications? What
happens to the kernel code over time, how do you handle changes to the
api there?
You are going to need a lot of people to agree that this is ok to
circumvent in order to allow this to be accepted, have you done that
yet?
> Please refer to this link for different storage models in Azure:
> https://docs.microsoft.com/en-us/azure/storage/common/storage-introduction#core-storage-services
Links are not good for changelogs as the kernel changelog is for
forever, while random company urls we have no control over.
thanks,
greg k-h
From: [email protected] <[email protected]> Sent: Tuesday, July 13, 2021 7:45 PM
>
> Azure Blob storage provides scalable and durable data storage for Azure.
> (https://azure.microsoft.com/en-us/services/storage/blobs/)
>
> This driver adds support for accelerated access to Azure Blob storage. As an
> alternative to REST APIs, it provides a fast data path that uses host native
> network stack and secure direct data link for storage server access.
>
> This driver will be ported to FreeBSD. It's dual licensed for BSD and GPL.
>
> Cc: Jonathan Corbet <[email protected]>
> Cc: "K. Y. Srinivasan" <[email protected]>
> Cc: Haiyang Zhang <[email protected]>
> Cc: Stephen Hemminger <[email protected]>
> Cc: Wei Liu <[email protected]>
> Cc: Dexuan Cui <[email protected]>
> Cc: Greg Kroah-Hartman <[email protected]>
> Cc: Bjorn Andersson <[email protected]>
> Cc: Hans de Goede <[email protected]>
> Cc: Dan Williams <[email protected]>
> Cc: Maximilian Luz <[email protected]>
> Cc: Mike Rapoport <[email protected]>
> Cc: Ben Widawsky <[email protected]>
> Cc: Jiri Slaby <[email protected]>
> Cc: Andra Paraschiv <[email protected]>
> Cc: Siddharth Gupta <[email protected]>
> Cc: Hannes Reinecke <[email protected]>
> Cc: [email protected]
> Signed-off-by: Long Li <[email protected]>
> ---
> Documentation/userspace-api/ioctl/ioctl-number.rst | 2 +
> drivers/hv/Kconfig | 10 +
> drivers/hv/Makefile | 1 +
> drivers/hv/azure_blob.c | 625 +++++++++++++++++++++
> drivers/hv/channel_mgmt.c | 7 +
> include/linux/hyperv.h | 9 +
> include/uapi/misc/azure_blob.h | 34 ++
> 7 files changed, 688 insertions(+)
> create mode 100644 drivers/hv/azure_blob.c
> create mode 100644 include/uapi/misc/azure_blob.h
>
> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> index 9bfc2b5..d3c2a90 100644
> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> @@ -180,6 +180,8 @@ Code Seq# Include File Comments
> 'R' 01 linux/rfkill.h conflict!
> 'R' C0-DF net/bluetooth/rfcomm.h
> 'R' E0 uapi/linux/fsl_mc.h
> +'R' F0-FF uapi/misc/azure_blob.h Microsoft Azure Blob driver
> + <mailto:[email protected]>
> 'S' all linux/cdrom.h conflict!
> 'S' 80-81 scsi/scsi_ioctl.h conflict!
> 'S' 82-FF scsi/scsi.h conflict!
> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> index 66c794d..e08b8d3 100644
> --- a/drivers/hv/Kconfig
> +++ b/drivers/hv/Kconfig
> @@ -27,4 +27,14 @@ config HYPERV_BALLOON
> help
> Select this option to enable Hyper-V Balloon driver.
>
> +config HYPERV_AZURE_BLOB
> + tristate "Microsoft Azure Blob driver"
> + depends on HYPERV && X86_64
> + help
> + Select this option to enable Microsoft Azure Blob driver.
> +
> + This driver supports accelerated Microsoft Azure Blob access.
> + To compile this driver as a module, choose M here. The module will be
> + called azure_blob.
> +
> endmenu
> diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
> index 94daf82..a322575 100644
> --- a/drivers/hv/Makefile
> +++ b/drivers/hv/Makefile
> @@ -2,6 +2,7 @@
> obj-$(CONFIG_HYPERV) += hv_vmbus.o
> obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
> obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
> +obj-$(CONFIG_HYPERV_AZURE_BLOB) += azure_blob.o
>
> CFLAGS_hv_trace.o = -I$(src)
> CFLAGS_hv_balloon.o = -I$(src)
> diff --git a/drivers/hv/azure_blob.c b/drivers/hv/azure_blob.c
> new file mode 100644
> index 0000000..5367d5e
> --- /dev/null
> +++ b/drivers/hv/azure_blob.c
> @@ -0,0 +1,625 @@
> +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH Linux-syscall-note
> +/* Copyright (c) Microsoft Corporation. */
> +
> +#include <uapi/misc/azure_blob.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <linux/cred.h>
> +#include <linux/debugfs.h>
> +#include <linux/pagemap.h>
> +#include <linux/hyperv.h>
> +#include <linux/miscdevice.h>
> +#include <linux/uio.h>
> +
> +struct az_blob_device {
> + struct hv_device *device;
> +
> + /* Opened files maintained by this device */
> + struct list_head file_list;
> + /* Lock for protecting file_list */
> + spinlock_t file_lock;
> + wait_queue_head_t file_wait;
> +
> + bool removing;
> +};
> +
> +/* VSP messages */
> +enum az_blob_vsp_request_type {
> + AZ_BLOB_DRIVER_REQUEST_FIRST = 0x100,
> + AZ_BLOB_DRIVER_USER_REQUEST = 0x100,
> + AZ_BLOB_DRIVER_REGISTER_BUFFER = 0x101,
> + AZ_BLOB_DRIVER_DEREGISTER_BUFFER = 0x102,
> +};
> +
> +/* VSC->VSP request */
> +struct az_blob_vsp_request {
> + u32 version;
> + u32 timeout_ms;
> + u32 data_buffer_offset;
> + u32 data_buffer_length;
> + u32 data_buffer_valid;
> + u32 operation_type;
> + u32 request_buffer_offset;
> + u32 request_buffer_length;
> + u32 response_buffer_offset;
> + u32 response_buffer_length;
> + guid_t transaction_id;
> +} __packed;
> +
> +/* VSP->VSC response */
> +struct az_blob_vsp_response {
> + u32 length;
> + u32 error;
> + u32 response_len;
> +} __packed;
> +
> +struct az_blob_vsp_request_ctx {
> + struct list_head list;
> + struct completion wait_vsp;
> + struct az_blob_request_sync *request;
> +};
> +
> +struct az_blob_file_ctx {
> + struct list_head list;
> +
> + /* List of pending requests to VSP */
> + struct list_head vsp_pending_requests;
> + /* Lock for protecting vsp_pending_requests */
> + spinlock_t vsp_pending_lock;
> + wait_queue_head_t wait_vsp_pending;
> +};
> +
> +/* The maximum number of pages we can pass to VSP in a single packet */
> +#define AZ_BLOB_MAX_PAGES 8192
> +
> +/* The one and only one */
> +static struct az_blob_device az_blob_dev;
> +
> +/* Ring buffer size in bytes */
> +#define AZ_BLOB_RING_SIZE (128 * 1024)
> +
> +/* System wide device queue depth */
> +#define AZ_BLOB_QUEUE_DEPTH 1024
> +
> +static const struct hv_vmbus_device_id id_table[] = {
> + { HV_AZURE_BLOB_GUID,
> + .driver_data = 0
> + },
> + { },
> +};
> +
> +#define az_blob_dbg(fmt, args...) dev_dbg(&az_blob_dev.device->device, fmt, ##args)
> +#define az_blob_info(fmt, args...) dev_info(&az_blob_dev.device->device, fmt, ##args)
> +#define az_blob_warn(fmt, args...) dev_warn(&az_blob_dev.device->device, fmt, ##args)
> +#define az_blob_err(fmt, args...) dev_err(&az_blob_dev.device->device, fmt, ##args)
> +
> +static void az_blob_on_channel_callback(void *context)
> +{
> + struct vmbus_channel *channel = (struct vmbus_channel *)context;
> + const struct vmpacket_descriptor *desc;
> +
> + az_blob_dbg("entering interrupt from vmbus\n");
> + foreach_vmbus_pkt(desc, channel) {
> + struct az_blob_vsp_request_ctx *request_ctx;
> + struct az_blob_vsp_response *response;
> + u64 cmd_rqst = vmbus_request_addr(&channel->requestor,
> + desc->trans_id);
> + if (cmd_rqst == VMBUS_RQST_ERROR) {
> + az_blob_err("incorrect transaction id %llu\n",
> + desc->trans_id);
> + continue;
> + }
> + request_ctx = (struct az_blob_vsp_request_ctx *)cmd_rqst;
> + response = hv_pkt_data(desc);
> +
> + az_blob_dbg("got response for request %pUb status %u "
> + "response_len %u\n",
> + &request_ctx->request->guid, response->error,
> + response->response_len);
> + request_ctx->request->response.status = response->error;
> + request_ctx->request->response.response_len =
> + response->response_len;
> + complete(&request_ctx->wait_vsp);
> + }
> +}
> +
> +static int az_blob_fop_open(struct inode *inode, struct file *file)
> +{
> + struct az_blob_file_ctx *file_ctx;
> + unsigned long flags;
> +
> + file_ctx = kzalloc(sizeof(*file_ctx), GFP_KERNEL);
> + if (!file_ctx)
> + return -ENOMEM;
> +
> + rcu_read_lock();
> +
> + if (az_blob_dev.removing) {
> + rcu_read_unlock();
> + kfree(file_ctx);
> + return -ENODEV;
> + }
> +
> + INIT_LIST_HEAD(&file_ctx->vsp_pending_requests);
> + init_waitqueue_head(&file_ctx->wait_vsp_pending);
> + spin_lock_init(&file_ctx->vsp_pending_lock);
> + file->private_data = file_ctx;
> +
> + spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> + list_add_tail(&file_ctx->list, &az_blob_dev.file_list);
> + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
I would think that this function is never called with interrupts
disabled, so the simpler spin_lock()/spin_unlock() functions
could be used.
> +
> + rcu_read_unlock();
> +
> + return 0;
> +}
> +
> +static int az_blob_fop_release(struct inode *inode, struct file *file)
> +{
> + struct az_blob_file_ctx *file_ctx = file->private_data;
> + unsigned long flags;
> +
> + wait_event(file_ctx->wait_vsp_pending,
> + list_empty(&file_ctx->vsp_pending_requests));
> +
> + spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> + list_del(&file_ctx->list);
> + if (list_empty(&az_blob_dev.file_list))
> + wake_up(&az_blob_dev.file_wait);
> + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
I would think that this function is never called with interrupts
disabled, so the simpler spin_lock()/spin_unlock() functions
could be used.
> +
> + kfree(file_ctx);
> +
> + return 0;
> +}
> +
> +static inline bool az_blob_safe_file_access(struct file *file)
> +{
> + return file->f_cred == current_cred() && !uaccess_kernel();
> +}
> +
> +static int get_buffer_pages(int rw, void __user *buffer, u32 buffer_len,
> + struct page ***ppages, size_t *start,
> + size_t *num_pages)
> +{
> + struct iovec iov;
> + struct iov_iter iter;
> + int ret;
> + ssize_t result;
> + struct page **pages;
> +
> + ret = import_single_range(rw, buffer, buffer_len, &iov, &iter);
> + if (ret) {
> + az_blob_dbg("request buffer access error %d\n", ret);
> + return ret;
> + }
> + az_blob_dbg("iov_iter type %d offset %lu count %lu nr_segs %lu\n",
> + iter.type, iter.iov_offset, iter.count, iter.nr_segs);
> +
> + result = iov_iter_get_pages_alloc(&iter, &pages, buffer_len, start);
> + if (result < 0) {
> + az_blob_dbg("failed to pin user pages result=%ld\n", result);
> + return result;
> + }
> + if (result != buffer_len) {
> + az_blob_dbg("can't pin user pages requested %d got %ld\n",
> + buffer_len, result);
> + return -EFAULT;
> + }
> +
> + *ppages = pages;
> + *num_pages = (result + *start + PAGE_SIZE - 1) / PAGE_SIZE;
> + return 0;
> +}
> +
> +static void fill_in_page_buffer(u64 *pfn_array, int *index,
> + struct page **pages, unsigned long num_pages)
> +{
> + int i, page_idx = *index;
> +
> + for (i = 0; i < num_pages; i++)
> + pfn_array[page_idx++] = page_to_pfn(pages[i]);
> + *index = page_idx;
> +}
> +
> +static void free_buffer_pages(size_t num_pages, struct page **pages)
> +{
> + unsigned long i;
> +
> + for (i = 0; i < num_pages; i++)
> + if (pages[i])
> + put_page(pages[i]);
> + kvfree(pages);
> +}
> +
> +static long az_blob_ioctl_user_request(struct file *filp, unsigned long arg)
> +{
> + struct az_blob_device *dev = &az_blob_dev;
> + struct az_blob_file_ctx *file_ctx = filp->private_data;
> + struct az_blob_request_sync __user *request_user =
> + (struct az_blob_request_sync __user *)arg;
> + struct az_blob_request_sync request;
> + struct az_blob_vsp_request_ctx request_ctx;
> + unsigned long flags;
> + int ret;
> + size_t request_start, request_num_pages = 0;
> + size_t response_start, response_num_pages = 0;
> + size_t data_start, data_num_pages = 0, total_num_pages;
> + struct page **request_pages = NULL, **response_pages = NULL;
> + struct page **data_pages = NULL;
> + struct vmbus_packet_mpb_array *desc;
> + u64 *pfn_array;
> + int desc_size;
> + int page_idx;
> + struct az_blob_vsp_request *vsp_request;
> +
> + /* Fast fail if device is being removed */
> + if (dev->removing)
> + return -ENODEV;
> +
> + if (!az_blob_safe_file_access(filp)) {
> + az_blob_dbg("process %d(%s) changed security contexts after"
> + " opening file descriptor\n",
> + task_tgid_vnr(current), current->comm);
> + return -EACCES;
> + }
> +
> + if (copy_from_user(&request, request_user, sizeof(request))) {
> + az_blob_dbg("don't have permission to user provided buffer\n");
> + return -EFAULT;
> + }
> +
> + az_blob_dbg("az_blob ioctl request guid %pUb timeout %u request_len %u"
> + " response_len %u data_len %u request_buffer %llx "
> + "response_buffer %llx data_buffer %llx\n",
> + &request.guid, request.timeout, request.request_len,
> + request.response_len, request.data_len, request.request_buffer,
> + request.response_buffer, request.data_buffer);
> +
> + if (!request.request_len || !request.response_len)
> + return -EINVAL;
> +
> + if (request.data_len && request.data_len < request.data_valid)
> + return -EINVAL;
> +
> + if (request.data_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES ||
> + request.request_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES ||
> + request.response_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES)
> + return -EINVAL;
> +
> + init_completion(&request_ctx.wait_vsp);
> + request_ctx.request = &request;
> +
> + /*
> + * Need to set rw to READ to have page table set up for passing to VSP.
> + * Setting it to WRITE will cause the page table entry not allocated
> + * as it's waiting on Copy-On-Write on next page fault. This doesn't
> + * work in this scenario because VSP wants all the pages to be present.
> + */
> + ret = get_buffer_pages(READ, (void __user *)request.request_buffer,
> + request.request_len, &request_pages,
> + &request_start, &request_num_pages);
> + if (ret)
> + goto get_user_page_failed;
> +
> + ret = get_buffer_pages(READ, (void __user *)request.response_buffer,
> + request.response_len, &response_pages,
> + &response_start, &response_num_pages);
> + if (ret)
> + goto get_user_page_failed;
> +
> + if (request.data_len) {
> + ret = get_buffer_pages(READ,
> + (void __user *)request.data_buffer,
> + request.data_len, &data_pages,
> + &data_start, &data_num_pages);
> + if (ret)
> + goto get_user_page_failed;
> + }
> +
> + total_num_pages = request_num_pages + response_num_pages +
> + data_num_pages;
> + if (total_num_pages > AZ_BLOB_MAX_PAGES) {
> + az_blob_dbg("number of DMA pages %lu buffer exceeding %u\n",
> + total_num_pages, AZ_BLOB_MAX_PAGES);
> + ret = -EINVAL;
> + goto get_user_page_failed;
> + }
> +
> + /* Construct a VMBUS packet and send it over to VSP */
> + desc_size = struct_size(desc, range.pfn_array, total_num_pages);
> + desc = kzalloc(desc_size, GFP_KERNEL);
> + vsp_request = kzalloc(sizeof(*vsp_request), GFP_KERNEL);
> + if (!desc || !vsp_request) {
> + kfree(desc);
> + kfree(vsp_request);
> + ret = -ENOMEM;
> + goto get_user_page_failed;
> + }
> +
> + desc->range.offset = 0;
> + desc->range.len = total_num_pages * PAGE_SIZE;
> + pfn_array = desc->range.pfn_array;
> + page_idx = 0;
> +
> + if (request.data_len) {
> + fill_in_page_buffer(pfn_array, &page_idx, data_pages,
> + data_num_pages);
> + vsp_request->data_buffer_offset = data_start;
> + vsp_request->data_buffer_length = request.data_len;
> + vsp_request->data_buffer_valid = request.data_valid;
> + }
> +
> + fill_in_page_buffer(pfn_array, &page_idx, request_pages,
> + request_num_pages);
> + vsp_request->request_buffer_offset = request_start +
> + data_num_pages * PAGE_SIZE;
> + vsp_request->request_buffer_length = request.request_len;
> +
> + fill_in_page_buffer(pfn_array, &page_idx, response_pages,
> + response_num_pages);
> + vsp_request->response_buffer_offset = response_start +
> + (data_num_pages + request_num_pages) * PAGE_SIZE;
> + vsp_request->response_buffer_length = request.response_len;
> +
> + vsp_request->version = 0;
> + vsp_request->timeout_ms = request.timeout;
> + vsp_request->operation_type = AZ_BLOB_DRIVER_USER_REQUEST;
> + guid_copy(&vsp_request->transaction_id, &request.guid);
> +
> + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> + list_add_tail(&request_ctx.list, &file_ctx->vsp_pending_requests);
> + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
I would think that this function is never called with interrupts
disabled, so the simpler spin_lock()/spin_unlock() functions
could be used.
> +
> + az_blob_dbg("sending request to VSP\n");
> + az_blob_dbg("desc_size %u desc->range.len %u desc->range.offset %u\n",
> + desc_size, desc->range.len, desc->range.offset);
> + az_blob_dbg("vsp_request data_buffer_offset %u data_buffer_length %u "
> + "data_buffer_valid %u request_buffer_offset %u "
> + "request_buffer_length %u response_buffer_offset %u "
> + "response_buffer_length %u\n",
> + vsp_request->data_buffer_offset,
> + vsp_request->data_buffer_length,
> + vsp_request->data_buffer_valid,
> + vsp_request->request_buffer_offset,
> + vsp_request->request_buffer_length,
> + vsp_request->response_buffer_offset,
> + vsp_request->response_buffer_length);
> +
> + ret = vmbus_sendpacket_mpb_desc(dev->device->channel, desc, desc_size,
> + vsp_request, sizeof(*vsp_request),
> + (u64)&request_ctx);
> +
> + kfree(desc);
> + kfree(vsp_request);
> + if (ret)
> + goto vmbus_send_failed;
> +
> + wait_for_completion(&request_ctx.wait_vsp);
> +
> + /*
> + * At this point, the response is already written to request
> + * by VMBUS completion handler, copy them to user-mode buffers
> + * and return to user-mode
> + */
> + if (copy_to_user(&request_user->response, &request.response,
> + sizeof(request.response)))
> + ret = -EFAULT;
> +
> +vmbus_send_failed:
> + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> + list_del(&request_ctx.list);
> + if (list_empty(&file_ctx->vsp_pending_requests))
> + wake_up(&file_ctx->wait_vsp_pending);
> + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
I would think that this function is never called with interrupts
disabled, so the simpler spin_lock()/spin_unlock() functions
could be used.
> +
> +get_user_page_failed:
> + free_buffer_pages(request_num_pages, request_pages);
> + free_buffer_pages(response_num_pages, response_pages);
> + free_buffer_pages(data_num_pages, data_pages);
> +
> + return ret;
> +}
> +
> +static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd,
> + unsigned long arg)
> +{
> + switch (cmd) {
> + case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST:
> + return az_blob_ioctl_user_request(filp, arg);
> +
> + default:
> + az_blob_dbg("unrecognized IOCTL code %u\n", cmd);
> + }
> +
> + return -EINVAL;
> +}
> +
> +static const struct file_operations az_blob_client_fops = {
> + .owner = THIS_MODULE,
> + .open = az_blob_fop_open,
> + .unlocked_ioctl = az_blob_fop_ioctl,
> + .release = az_blob_fop_release,
> +};
> +
> +static struct miscdevice az_blob_misc_device = {
> + MISC_DYNAMIC_MINOR,
> + "azure_blob",
> + &az_blob_client_fops,
> +};
> +
> +static int az_blob_show_pending_requests(struct seq_file *m, void *v)
> +{
> + unsigned long flags, flags2;
> + struct az_blob_vsp_request_ctx *request_ctx;
> + struct az_blob_file_ctx *file_ctx;
> +
> + seq_puts(m, "List of pending requests\n");
> + seq_puts(m, "UUID request_len response_len data_len "
> + "request_buffer response_buffer data_buffer\n");
> + spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> + list_for_each_entry(file_ctx, &az_blob_dev.file_list, list) {
> + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags2);
> + list_for_each_entry(request_ctx,
> + &file_ctx->vsp_pending_requests, list) {
> + seq_printf(m, "%pUb ", &request_ctx->request->guid);
> + seq_printf(m, "%u ", request_ctx->request->request_len);
> + seq_printf(m, "%u ",
> + request_ctx->request->response_len);
> + seq_printf(m, "%u ", request_ctx->request->data_len);
> + seq_printf(m, "%llx ",
> + request_ctx->request->request_buffer);
> + seq_printf(m, "%llx ",
> + request_ctx->request->response_buffer);
> + seq_printf(m, "%llx\n",
> + request_ctx->request->data_buffer);
> + }
> + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags2);
> + }
> + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> +
> + return 0;
> +}
> +
> +static int az_blob_debugfs_open(struct inode *inode, struct file *file)
> +{
> + return single_open(file, az_blob_show_pending_requests, NULL);
> +}
> +
> +static const struct file_operations az_blob_debugfs_fops = {
> + .open = az_blob_debugfs_open,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = seq_release
> +};
> +
> +static void az_blob_remove_device(void)
> +{
> + struct dentry *debugfs_root = debugfs_lookup("az_blob", NULL);
> +
> + misc_deregister(&az_blob_misc_device);
> + debugfs_remove_recursive(debugfs_lookup("pending_requests",
> + debugfs_root));
> + debugfs_remove_recursive(debugfs_root);
> + /* At this point, we won't get any requests from user-mode */
> +}
> +
> +static int az_blob_create_device(struct az_blob_device *dev)
> +{
> + int ret;
> + struct dentry *debugfs_root;
> +
> + ret = misc_register(&az_blob_misc_device);
> + if (ret)
> + return ret;
> +
> + debugfs_root = debugfs_create_dir("az_blob", NULL);
> + debugfs_create_file("pending_requests", 0400, debugfs_root, NULL,
> + &az_blob_debugfs_fops);
> +
> + return 0;
> +}
> +
> +static int az_blob_connect_to_vsp(struct hv_device *device, u32 ring_size)
> +{
> + int ret;
> +
> + spin_lock_init(&az_blob_dev.file_lock);
> + INIT_LIST_HEAD(&az_blob_dev.file_list);
> + init_waitqueue_head(&az_blob_dev.file_wait);
> +
> + az_blob_dev.device = device;
> + device->channel->rqstor_size = AZ_BLOB_QUEUE_DEPTH;
> +
> + ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
> + az_blob_on_channel_callback, device->channel);
> +
> + if (ret)
> + return ret;
> +
> + hv_set_drvdata(device, &az_blob_dev);
> +
> + return ret;
> +}
> +
> +static void az_blob_remove_vmbus(struct hv_device *device)
> +{
> + /* At this point, no VSC/VSP traffic is possible over vmbus */
> + hv_set_drvdata(device, NULL);
> + vmbus_close(device->channel);
> +}
> +
> +static int az_blob_probe(struct hv_device *device,
> + const struct hv_vmbus_device_id *dev_id)
> +{
> + int ret;
> +
> + ret = az_blob_connect_to_vsp(device, AZ_BLOB_RING_SIZE);
> + if (ret) {
> + az_blob_err("error connecting to VSP ret=%d\n", ret);
> + return ret;
> + }
> +
> + // create user-mode client library facing device
> + ret = az_blob_create_device(&az_blob_dev);
> + if (ret) {
> + az_blob_err("failed to create device ret=%d\n", ret);
> + az_blob_remove_vmbus(device);
> + return ret;
> + }
> +
> + az_blob_dev.removing = false;
This line seems misplaced. As soon as az_blob_create_device()
returns, some other thread could find the device and open it. You
don't want the az_blob_fop_open() function to find the "removing"
flag set to true. So I think this line should go *before* the call to
az_blob_create_device().
> + az_blob_info("successfully probed device\n");
> +
> + return 0;
> +}
> +
> +static int az_blob_remove(struct hv_device *dev)
> +{
> + az_blob_dev.removing = true;
> + /*
> + * RCU lock guarantees that any future calls to az_blob_fop_open()
> + * can not use device resources while the inode reference of
> + * /dev/azure_blob is being held for that open, and device file is
> + * being removed from /dev.
> + */
> + synchronize_rcu();
I don't think this works as you have described. While it will ensure that
any in-progress RCU read-side critical sections have completed (i.e.,
in az_blob_fop_open() ), it does not prevent new read-side critical sections
from being started. So as soon as synchronize_rcu() is finished, another
thread could find and open the device, and be executing in
az_blob_fop_open().
But it's not clear to me that this (and the rcu_read_locks in az_blob_fop_open)
are really needed anyway. If az_blob_remove_device() is called while one or more
threads have it open, I think that's OK. Or is there a scenario that I'm missing?
> +
> + az_blob_info("removing device\n");
> + az_blob_remove_device();
> +
> + /*
> + * At this point, it's not possible to open more files.
> + * Wait for all the opened files to be released.
> + */
> + wait_event(az_blob_dev.file_wait, list_empty(&az_blob_dev.file_list));
As mentioned in my most recent comments on the previous version of the
code, I'm concerned about waiting for all open files to be released in the case
of a VMbus rescind. We definitely have to wait for all VSP operations to
complete, but that's different from waiting for the files to be closed. The former
depends on Hyper-V being well-behaved and will presumably happen quickly
in the case of a rescind. But the latter depends entirely on user space code
that is out of the kernel's control. The user space process could hang around
for hours or days with the file still open, which would block this function
from completing, and hence block the global work queue.
Thinking about this, the core issue may be that having a single static
instance of az_blob_device is problematic if we allow the device to be
removed (either explicitly with an unbind, or implicitly with a VMbus
rescind) and then re-added. Perhaps what needs to happen is that
the removed device is allowed to continue to exist as long as user
space processes have an open file handle, but they can't perform
any operations because the "removing" flag is set (and stays set).
If the device is re-added, then a new instance of az_blob_device
should be created, and whether or not the old instance is still
hanging around is irrelevant.
> +
> + az_blob_remove_vmbus(dev);
> + return 0;
> +}
> +
> +static struct hv_driver az_blob_drv = {
> + .name = KBUILD_MODNAME,
> + .id_table = id_table,
> + .probe = az_blob_probe,
> + .remove = az_blob_remove,
> + .driver = {
> + .probe_type = PROBE_PREFER_ASYNCHRONOUS,
> + },
> +};
> +
> +static int __init az_blob_drv_init(void)
> +{
> + return vmbus_driver_register(&az_blob_drv);
> +}
> +
> +static void __exit az_blob_drv_exit(void)
> +{
> + vmbus_driver_unregister(&az_blob_drv);
> +}
> +
> +MODULE_LICENSE("Dual BSD/GPL");
> +MODULE_DESCRIPTION("Microsoft Azure Blob driver");
> +module_init(az_blob_drv_init);
> +module_exit(az_blob_drv_exit);
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index 705e95d..3095611 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -154,6 +154,13 @@
> .allowed_in_isolated = false,
> },
>
> + /* Azure Blob */
> + { .dev_type = HV_AZURE_BLOB,
> + HV_AZURE_BLOB_GUID,
> + .perf_device = false,
> + .allowed_in_isolated = false,
> + },
> +
> /* Unknown GUID */
> { .dev_type = HV_UNKNOWN,
> .perf_device = false,
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> index d1e59db..ac31362 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -772,6 +772,7 @@ enum vmbus_device_type {
> HV_FCOPY,
> HV_BACKUP,
> HV_DM,
> + HV_AZURE_BLOB,
> HV_UNKNOWN,
> };
>
> @@ -1350,6 +1351,14 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
> 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f)
>
> /*
> + * Azure Blob GUID
> + * {0590b792-db64-45cc-81db-b8d70c577c9e}
> + */
> +#define HV_AZURE_BLOB_GUID \
> + .guid = GUID_INIT(0x0590b792, 0xdb64, 0x45cc, 0x81, 0xdb, \
> + 0xb8, 0xd7, 0x0c, 0x57, 0x7c, 0x9e)
> +
> +/*
> * Shutdown GUID
> * {0e0b6031-5213-4934-818b-38d90ced39db}
> */
> diff --git a/include/uapi/misc/azure_blob.h b/include/uapi/misc/azure_blob.h
> new file mode 100644
> index 0000000..f4168679
> --- /dev/null
> +++ b/include/uapi/misc/azure_blob.h
> @@ -0,0 +1,34 @@
> +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH Linux-syscall-note */
> +/* Copyright (c) Microsoft Corporation. */
> +
> +#ifndef _AZ_BLOB_H
> +#define _AZ_BLOB_H
> +
> +#include <linux/kernel.h>
> +#include <linux/uuid.h>
> +
> +/* user-mode sync request sent through ioctl */
> +struct az_blob_request_sync_response {
> + __u32 status;
> + __u32 response_len;
> +};
> +
> +struct az_blob_request_sync {
> + guid_t guid;
> + __u32 timeout;
> + __u32 request_len;
> + __u32 response_len;
> + __u32 data_len;
> + __u32 data_valid;
> + __aligned_u64 request_buffer;
Is there an implied 32-bit padding field before "request_buffer"?
It seems like "yes", since there are five 32-bit field preceding.
Would it be better to explicitly list that padding field?
> + __aligned_u64 response_buffer;
> + __aligned_u64 data_buffer;
> + struct az_blob_request_sync_response response;
> +};
> +
> +#define AZ_BLOB_MAGIC_NUMBER 'R'
> +#define IOCTL_AZ_BLOB_DRIVER_USER_REQUEST \
> + _IOWR(AZ_BLOB_MAGIC_NUMBER, 0xf0, \
> + struct az_blob_request_sync)
> +
> +#endif /* define _AZ_BLOB_H */
> --
> 1.8.3.1
On Fri, Jul 16, 2021 at 03:56:14PM +0000, Michael Kelley wrote:
> > +static int az_blob_remove(struct hv_device *dev)
> > +{
> > + az_blob_dev.removing = true;
> > + /*
> > + * RCU lock guarantees that any future calls to az_blob_fop_open()
> > + * can not use device resources while the inode reference of
> > + * /dev/azure_blob is being held for that open, and device file is
> > + * being removed from /dev.
> > + */
> > + synchronize_rcu();
>
> I don't think this works as you have described. While it will ensure that
> any in-progress RCU read-side critical sections have completed (i.e.,
> in az_blob_fop_open() ), it does not prevent new read-side critical sections
> from being started. So as soon as synchronize_rcu() is finished, another
> thread could find and open the device, and be executing in
> az_blob_fop_open().
>
> But it's not clear to me that this (and the rcu_read_locks in az_blob_fop_open)
> are really needed anyway. If az_blob_remove_device() is called while one or more
> threads have it open, I think that's OK. Or is there a scenario that I'm missing?
This should not be different from any other tiny character device, why
the mess with RCU at all?
> > + az_blob_info("removing device\n");
> > + az_blob_remove_device();
> > +
> > + /*
> > + * At this point, it's not possible to open more files.
> > + * Wait for all the opened files to be released.
> > + */
> > + wait_event(az_blob_dev.file_wait, list_empty(&az_blob_dev.file_list));
>
> As mentioned in my most recent comments on the previous version of the
> code, I'm concerned about waiting for all open files to be released in the case
> of a VMbus rescind. We definitely have to wait for all VSP operations to
> complete, but that's different from waiting for the files to be closed. The former
> depends on Hyper-V being well-behaved and will presumably happen quickly
> in the case of a rescind. But the latter depends entirely on user space code
> that is out of the kernel's control. The user space process could hang around
> for hours or days with the file still open, which would block this function
> from completing, and hence block the global work queue.
>
> Thinking about this, the core issue may be that having a single static
> instance of az_blob_device is problematic if we allow the device to be
> removed (either explicitly with an unbind, or implicitly with a VMbus
> rescind) and then re-added. Perhaps what needs to happen is that
> the removed device is allowed to continue to exist as long as user
> space processes have an open file handle, but they can't perform
> any operations because the "removing" flag is set (and stays set).
> If the device is re-added, then a new instance of az_blob_device
> should be created, and whether or not the old instance is still
> hanging around is irrelevant.
You should never have a single static copy of the device, that was going
to be my first review comment once this all actually got to a place that
made sense to review (which it is not even there yet.) When you do
that, then you have these crazy race issues you speak of. Use the misc
api correctly and you will not have any of these problems, why people
try to make it harder is beyond me...
thanks,
greg k-h
> Subject: RE: [Patch v3 2/3] Drivers: hv: add Azure Blob driver
>
> From: [email protected] <[email protected]> Sent:
> Tuesday, July 13, 2021 7:45 PM
> >
> > Azure Blob storage provides scalable and durable data storage for Azure.
> >
> (https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fazu
> > re.microsoft.com%2Fen-
> us%2Fservices%2Fstorage%2Fblobs%2F&data=04%7
> >
> C01%7Clongli%40microsoft.com%7C852590ffe972466d24b308d948723d8c%7C
> 72f9
> >
> 88bf86f141af91ab2d7cd011db47%7C1%7C0%7C637620477769866945%7CUnk
> nown%7C
> >
> TWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiL
> CJXVC
> >
> I6Mn0%3D%7C1000&sdata=PZYfSVoLanlTv%2Fi%2Bks4a1IMM0cMiRxP2
> poypgs20
> > S7c%3D&reserved=0)
> >
> > This driver adds support for accelerated access to Azure Blob storage.
> > As an alternative to REST APIs, it provides a fast data path that uses
> > host native network stack and secure direct data link for storage server
> access.
> >
> > This driver will be ported to FreeBSD. It's dual licensed for BSD and GPL.
> >
> > Cc: Jonathan Corbet <[email protected]>
> > Cc: "K. Y. Srinivasan" <[email protected]>
> > Cc: Haiyang Zhang <[email protected]>
> > Cc: Stephen Hemminger <[email protected]>
> > Cc: Wei Liu <[email protected]>
> > Cc: Dexuan Cui <[email protected]>
> > Cc: Greg Kroah-Hartman <[email protected]>
> > Cc: Bjorn Andersson <[email protected]>
> > Cc: Hans de Goede <[email protected]>
> > Cc: Dan Williams <[email protected]>
> > Cc: Maximilian Luz <[email protected]>
> > Cc: Mike Rapoport <[email protected]>
> > Cc: Ben Widawsky <[email protected]>
> > Cc: Jiri Slaby <[email protected]>
> > Cc: Andra Paraschiv <[email protected]>
> > Cc: Siddharth Gupta <[email protected]>
> > Cc: Hannes Reinecke <[email protected]>
> > Cc: [email protected]
> > Signed-off-by: Long Li <[email protected]>
> > ---
> > Documentation/userspace-api/ioctl/ioctl-number.rst | 2 +
> > drivers/hv/Kconfig | 10 +
> > drivers/hv/Makefile | 1 +
> > drivers/hv/azure_blob.c | 625 +++++++++++++++++++++
> > drivers/hv/channel_mgmt.c | 7 +
> > include/linux/hyperv.h | 9 +
> > include/uapi/misc/azure_blob.h | 34 ++
> > 7 files changed, 688 insertions(+)
> > create mode 100644 drivers/hv/azure_blob.c create mode 100644
> > include/uapi/misc/azure_blob.h
> >
> > diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst
> > b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > index 9bfc2b5..d3c2a90 100644
> > --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> > +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > @@ -180,6 +180,8 @@ Code Seq# Include File
> Comments
> > 'R' 01 linux/rfkill.h conflict!
> > 'R' C0-DF net/bluetooth/rfcomm.h
> > 'R' E0 uapi/linux/fsl_mc.h
> > +'R' F0-FF uapi/misc/azure_blob.h Microsoft Azure Blob
> driver
> > +
> > +<mailto:[email protected]>
> > 'S' all linux/cdrom.h conflict!
> > 'S' 80-81 scsi/scsi_ioctl.h conflict!
> > 'S' 82-FF scsi/scsi.h conflict!
> > diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index
> > 66c794d..e08b8d3 100644
> > --- a/drivers/hv/Kconfig
> > +++ b/drivers/hv/Kconfig
> > @@ -27,4 +27,14 @@ config HYPERV_BALLOON
> > help
> > Select this option to enable Hyper-V Balloon driver.
> >
> > +config HYPERV_AZURE_BLOB
> > + tristate "Microsoft Azure Blob driver"
> > + depends on HYPERV && X86_64
> > + help
> > + Select this option to enable Microsoft Azure Blob driver.
> > +
> > + This driver supports accelerated Microsoft Azure Blob access.
> > + To compile this driver as a module, choose M here. The module will
> be
> > + called azure_blob.
> > +
> > endmenu
> > diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index
> > 94daf82..a322575 100644
> > --- a/drivers/hv/Makefile
> > +++ b/drivers/hv/Makefile
> > @@ -2,6 +2,7 @@
> > obj-$(CONFIG_HYPERV) += hv_vmbus.o
> > obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
> > obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
> > +obj-$(CONFIG_HYPERV_AZURE_BLOB) += azure_blob.o
> >
> > CFLAGS_hv_trace.o = -I$(src)
> > CFLAGS_hv_balloon.o = -I$(src)
> > diff --git a/drivers/hv/azure_blob.c b/drivers/hv/azure_blob.c new
> > file mode 100644 index 0000000..5367d5e
> > --- /dev/null
> > +++ b/drivers/hv/azure_blob.c
> > @@ -0,0 +1,625 @@
> > +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH
> > +Linux-syscall-note
> > +/* Copyright (c) Microsoft Corporation. */
> > +
> > +#include <uapi/misc/azure_blob.h>
> > +#include <linux/module.h>
> > +#include <linux/device.h>
> > +#include <linux/slab.h>
> > +#include <linux/cred.h>
> > +#include <linux/debugfs.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/hyperv.h>
> > +#include <linux/miscdevice.h>
> > +#include <linux/uio.h>
> > +
> > +struct az_blob_device {
> > + struct hv_device *device;
> > +
> > + /* Opened files maintained by this device */
> > + struct list_head file_list;
> > + /* Lock for protecting file_list */
> > + spinlock_t file_lock;
> > + wait_queue_head_t file_wait;
> > +
> > + bool removing;
> > +};
> > +
> > +/* VSP messages */
> > +enum az_blob_vsp_request_type {
> > + AZ_BLOB_DRIVER_REQUEST_FIRST = 0x100,
> > + AZ_BLOB_DRIVER_USER_REQUEST = 0x100,
> > + AZ_BLOB_DRIVER_REGISTER_BUFFER = 0x101,
> > + AZ_BLOB_DRIVER_DEREGISTER_BUFFER = 0x102, };
> > +
> > +/* VSC->VSP request */
> > +struct az_blob_vsp_request {
> > + u32 version;
> > + u32 timeout_ms;
> > + u32 data_buffer_offset;
> > + u32 data_buffer_length;
> > + u32 data_buffer_valid;
> > + u32 operation_type;
> > + u32 request_buffer_offset;
> > + u32 request_buffer_length;
> > + u32 response_buffer_offset;
> > + u32 response_buffer_length;
> > + guid_t transaction_id;
> > +} __packed;
> > +
> > +/* VSP->VSC response */
> > +struct az_blob_vsp_response {
> > + u32 length;
> > + u32 error;
> > + u32 response_len;
> > +} __packed;
> > +
> > +struct az_blob_vsp_request_ctx {
> > + struct list_head list;
> > + struct completion wait_vsp;
> > + struct az_blob_request_sync *request; };
> > +
> > +struct az_blob_file_ctx {
> > + struct list_head list;
> > +
> > + /* List of pending requests to VSP */
> > + struct list_head vsp_pending_requests;
> > + /* Lock for protecting vsp_pending_requests */
> > + spinlock_t vsp_pending_lock;
> > + wait_queue_head_t wait_vsp_pending;
> > +};
> > +
> > +/* The maximum number of pages we can pass to VSP in a single packet
> > +*/ #define AZ_BLOB_MAX_PAGES 8192
> > +
> > +/* The one and only one */
> > +static struct az_blob_device az_blob_dev;
> > +
> > +/* Ring buffer size in bytes */
> > +#define AZ_BLOB_RING_SIZE (128 * 1024)
> > +
> > +/* System wide device queue depth */
> > +#define AZ_BLOB_QUEUE_DEPTH 1024
> > +
> > +static const struct hv_vmbus_device_id id_table[] = {
> > + { HV_AZURE_BLOB_GUID,
> > + .driver_data = 0
> > + },
> > + { },
> > +};
> > +
> > +#define az_blob_dbg(fmt, args...)
> > +dev_dbg(&az_blob_dev.device->device, fmt, ##args) #define
> > +az_blob_info(fmt, args...) dev_info(&az_blob_dev.device->device, fmt,
> > +##args) #define az_blob_warn(fmt, args...)
> > +dev_warn(&az_blob_dev.device->device, fmt, ##args) #define
> > +az_blob_err(fmt, args...) dev_err(&az_blob_dev.device->device, fmt,
> > +##args)
> > +
> > +static void az_blob_on_channel_callback(void *context) {
> > + struct vmbus_channel *channel = (struct vmbus_channel *)context;
> > + const struct vmpacket_descriptor *desc;
> > +
> > + az_blob_dbg("entering interrupt from vmbus\n");
> > + foreach_vmbus_pkt(desc, channel) {
> > + struct az_blob_vsp_request_ctx *request_ctx;
> > + struct az_blob_vsp_response *response;
> > + u64 cmd_rqst = vmbus_request_addr(&channel->requestor,
> > + desc->trans_id);
> > + if (cmd_rqst == VMBUS_RQST_ERROR) {
> > + az_blob_err("incorrect transaction id %llu\n",
> > + desc->trans_id);
> > + continue;
> > + }
> > + request_ctx = (struct az_blob_vsp_request_ctx *)cmd_rqst;
> > + response = hv_pkt_data(desc);
> > +
> > + az_blob_dbg("got response for request %pUb status %u "
> > + "response_len %u\n",
> > + &request_ctx->request->guid, response->error,
> > + response->response_len);
> > + request_ctx->request->response.status = response->error;
> > + request_ctx->request->response.response_len =
> > + response->response_len;
> > + complete(&request_ctx->wait_vsp);
> > + }
> > +}
> > +
> > +static int az_blob_fop_open(struct inode *inode, struct file *file) {
> > + struct az_blob_file_ctx *file_ctx;
> > + unsigned long flags;
> > +
> > + file_ctx = kzalloc(sizeof(*file_ctx), GFP_KERNEL);
> > + if (!file_ctx)
> > + return -ENOMEM;
> > +
> > + rcu_read_lock();
> > +
> > + if (az_blob_dev.removing) {
> > + rcu_read_unlock();
> > + kfree(file_ctx);
> > + return -ENODEV;
> > + }
> > +
> > + INIT_LIST_HEAD(&file_ctx->vsp_pending_requests);
> > + init_waitqueue_head(&file_ctx->wait_vsp_pending);
> > + spin_lock_init(&file_ctx->vsp_pending_lock);
> > + file->private_data = file_ctx;
> > +
> > + spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> > + list_add_tail(&file_ctx->list, &az_blob_dev.file_list);
> > + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
>
> I would think that this function is never called with interrupts disabled, so the
> simpler spin_lock()/spin_unlock() functions could be used.
>
> > +
> > + rcu_read_unlock();
> > +
> > + return 0;
> > +}
> > +
> > +static int az_blob_fop_release(struct inode *inode, struct file
> > +*file) {
> > + struct az_blob_file_ctx *file_ctx = file->private_data;
> > + unsigned long flags;
> > +
> > + wait_event(file_ctx->wait_vsp_pending,
> > + list_empty(&file_ctx->vsp_pending_requests));
> > +
> > + spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> > + list_del(&file_ctx->list);
> > + if (list_empty(&az_blob_dev.file_list))
> > + wake_up(&az_blob_dev.file_wait);
> > + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
>
> I would think that this function is never called with interrupts disabled, so the
> simpler spin_lock()/spin_unlock() functions could be used.
>
> > +
> > + kfree(file_ctx);
> > +
> > + return 0;
> > +}
> > +
> > +static inline bool az_blob_safe_file_access(struct file *file) {
> > + return file->f_cred == current_cred() && !uaccess_kernel(); }
> > +
> > +static int get_buffer_pages(int rw, void __user *buffer, u32 buffer_len,
> > + struct page ***ppages, size_t *start,
> > + size_t *num_pages)
> > +{
> > + struct iovec iov;
> > + struct iov_iter iter;
> > + int ret;
> > + ssize_t result;
> > + struct page **pages;
> > +
> > + ret = import_single_range(rw, buffer, buffer_len, &iov, &iter);
> > + if (ret) {
> > + az_blob_dbg("request buffer access error %d\n", ret);
> > + return ret;
> > + }
> > + az_blob_dbg("iov_iter type %d offset %lu count %lu nr_segs %lu\n",
> > + iter.type, iter.iov_offset, iter.count, iter.nr_segs);
> > +
> > + result = iov_iter_get_pages_alloc(&iter, &pages, buffer_len, start);
> > + if (result < 0) {
> > + az_blob_dbg("failed to pin user pages result=%ld\n", result);
> > + return result;
> > + }
> > + if (result != buffer_len) {
> > + az_blob_dbg("can't pin user pages requested %d got %ld\n",
> > + buffer_len, result);
> > + return -EFAULT;
> > + }
> > +
> > + *ppages = pages;
> > + *num_pages = (result + *start + PAGE_SIZE - 1) / PAGE_SIZE;
> > + return 0;
> > +}
> > +
> > +static void fill_in_page_buffer(u64 *pfn_array, int *index,
> > + struct page **pages, unsigned long
> num_pages) {
> > + int i, page_idx = *index;
> > +
> > + for (i = 0; i < num_pages; i++)
> > + pfn_array[page_idx++] = page_to_pfn(pages[i]);
> > + *index = page_idx;
> > +}
> > +
> > +static void free_buffer_pages(size_t num_pages, struct page **pages)
> > +{
> > + unsigned long i;
> > +
> > + for (i = 0; i < num_pages; i++)
> > + if (pages[i])
> > + put_page(pages[i]);
> > + kvfree(pages);
> > +}
> > +
> > +static long az_blob_ioctl_user_request(struct file *filp, unsigned
> > +long arg) {
> > + struct az_blob_device *dev = &az_blob_dev;
> > + struct az_blob_file_ctx *file_ctx = filp->private_data;
> > + struct az_blob_request_sync __user *request_user =
> > + (struct az_blob_request_sync __user *)arg;
> > + struct az_blob_request_sync request;
> > + struct az_blob_vsp_request_ctx request_ctx;
> > + unsigned long flags;
> > + int ret;
> > + size_t request_start, request_num_pages = 0;
> > + size_t response_start, response_num_pages = 0;
> > + size_t data_start, data_num_pages = 0, total_num_pages;
> > + struct page **request_pages = NULL, **response_pages = NULL;
> > + struct page **data_pages = NULL;
> > + struct vmbus_packet_mpb_array *desc;
> > + u64 *pfn_array;
> > + int desc_size;
> > + int page_idx;
> > + struct az_blob_vsp_request *vsp_request;
> > +
> > + /* Fast fail if device is being removed */
> > + if (dev->removing)
> > + return -ENODEV;
> > +
> > + if (!az_blob_safe_file_access(filp)) {
> > + az_blob_dbg("process %d(%s) changed security contexts
> after"
> > + " opening file descriptor\n",
> > + task_tgid_vnr(current), current->comm);
> > + return -EACCES;
> > + }
> > +
> > + if (copy_from_user(&request, request_user, sizeof(request))) {
> > + az_blob_dbg("don't have permission to user provided
> buffer\n");
> > + return -EFAULT;
> > + }
> > +
> > + az_blob_dbg("az_blob ioctl request guid %pUb timeout %u
> request_len %u"
> > + " response_len %u data_len %u request_buffer %llx "
> > + "response_buffer %llx data_buffer %llx\n",
> > + &request.guid, request.timeout, request.request_len,
> > + request.response_len, request.data_len,
> request.request_buffer,
> > + request.response_buffer, request.data_buffer);
> > +
> > + if (!request.request_len || !request.response_len)
> > + return -EINVAL;
> > +
> > + if (request.data_len && request.data_len < request.data_valid)
> > + return -EINVAL;
> > +
> > + if (request.data_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES ||
> > + request.request_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES ||
> > + request.response_len > PAGE_SIZE * AZ_BLOB_MAX_PAGES)
> > + return -EINVAL;
> > +
> > + init_completion(&request_ctx.wait_vsp);
> > + request_ctx.request = &request;
> > +
> > + /*
> > + * Need to set rw to READ to have page table set up for passing to
> VSP.
> > + * Setting it to WRITE will cause the page table entry not allocated
> > + * as it's waiting on Copy-On-Write on next page fault. This doesn't
> > + * work in this scenario because VSP wants all the pages to be
> present.
> > + */
> > + ret = get_buffer_pages(READ, (void __user
> *)request.request_buffer,
> > + request.request_len, &request_pages,
> > + &request_start, &request_num_pages);
> > + if (ret)
> > + goto get_user_page_failed;
> > +
> > + ret = get_buffer_pages(READ, (void __user
> *)request.response_buffer,
> > + request.response_len, &response_pages,
> > + &response_start, &response_num_pages);
> > + if (ret)
> > + goto get_user_page_failed;
> > +
> > + if (request.data_len) {
> > + ret = get_buffer_pages(READ,
> > + (void __user *)request.data_buffer,
> > + request.data_len, &data_pages,
> > + &data_start, &data_num_pages);
> > + if (ret)
> > + goto get_user_page_failed;
> > + }
> > +
> > + total_num_pages = request_num_pages + response_num_pages +
> > + data_num_pages;
> > + if (total_num_pages > AZ_BLOB_MAX_PAGES) {
> > + az_blob_dbg("number of DMA pages %lu buffer
> exceeding %u\n",
> > + total_num_pages, AZ_BLOB_MAX_PAGES);
> > + ret = -EINVAL;
> > + goto get_user_page_failed;
> > + }
> > +
> > + /* Construct a VMBUS packet and send it over to VSP */
> > + desc_size = struct_size(desc, range.pfn_array, total_num_pages);
> > + desc = kzalloc(desc_size, GFP_KERNEL);
> > + vsp_request = kzalloc(sizeof(*vsp_request), GFP_KERNEL);
> > + if (!desc || !vsp_request) {
> > + kfree(desc);
> > + kfree(vsp_request);
> > + ret = -ENOMEM;
> > + goto get_user_page_failed;
> > + }
> > +
> > + desc->range.offset = 0;
> > + desc->range.len = total_num_pages * PAGE_SIZE;
> > + pfn_array = desc->range.pfn_array;
> > + page_idx = 0;
> > +
> > + if (request.data_len) {
> > + fill_in_page_buffer(pfn_array, &page_idx, data_pages,
> > + data_num_pages);
> > + vsp_request->data_buffer_offset = data_start;
> > + vsp_request->data_buffer_length = request.data_len;
> > + vsp_request->data_buffer_valid = request.data_valid;
> > + }
> > +
> > + fill_in_page_buffer(pfn_array, &page_idx, request_pages,
> > + request_num_pages);
> > + vsp_request->request_buffer_offset = request_start +
> > + data_num_pages *
> PAGE_SIZE;
> > + vsp_request->request_buffer_length = request.request_len;
> > +
> > + fill_in_page_buffer(pfn_array, &page_idx, response_pages,
> > + response_num_pages);
> > + vsp_request->response_buffer_offset = response_start +
> > + (data_num_pages + request_num_pages) * PAGE_SIZE;
> > + vsp_request->response_buffer_length = request.response_len;
> > +
> > + vsp_request->version = 0;
> > + vsp_request->timeout_ms = request.timeout;
> > + vsp_request->operation_type = AZ_BLOB_DRIVER_USER_REQUEST;
> > + guid_copy(&vsp_request->transaction_id, &request.guid);
> > +
> > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> > + list_add_tail(&request_ctx.list, &file_ctx->vsp_pending_requests);
> > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
>
> I would think that this function is never called with interrupts disabled, so the
> simpler spin_lock()/spin_unlock() functions could be used.
>
> > +
> > + az_blob_dbg("sending request to VSP\n");
> > + az_blob_dbg("desc_size %u desc->range.len %u desc-
> >range.offset %u\n",
> > + desc_size, desc->range.len, desc->range.offset);
> > + az_blob_dbg("vsp_request data_buffer_offset %u
> data_buffer_length %u "
> > + "data_buffer_valid %u request_buffer_offset %u "
> > + "request_buffer_length %u response_buffer_offset %u "
> > + "response_buffer_length %u\n",
> > + vsp_request->data_buffer_offset,
> > + vsp_request->data_buffer_length,
> > + vsp_request->data_buffer_valid,
> > + vsp_request->request_buffer_offset,
> > + vsp_request->request_buffer_length,
> > + vsp_request->response_buffer_offset,
> > + vsp_request->response_buffer_length);
> > +
> > + ret = vmbus_sendpacket_mpb_desc(dev->device->channel, desc,
> desc_size,
> > + vsp_request, sizeof(*vsp_request),
> > + (u64)&request_ctx);
> > +
> > + kfree(desc);
> > + kfree(vsp_request);
> > + if (ret)
> > + goto vmbus_send_failed;
> > +
> > + wait_for_completion(&request_ctx.wait_vsp);
> > +
> > + /*
> > + * At this point, the response is already written to request
> > + * by VMBUS completion handler, copy them to user-mode buffers
> > + * and return to user-mode
> > + */
> > + if (copy_to_user(&request_user->response, &request.response,
> > + sizeof(request.response)))
> > + ret = -EFAULT;
> > +
> > +vmbus_send_failed:
> > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags);
> > + list_del(&request_ctx.list);
> > + if (list_empty(&file_ctx->vsp_pending_requests))
> > + wake_up(&file_ctx->wait_vsp_pending);
> > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags);
>
> I would think that this function is never called with interrupts disabled, so the
> simpler spin_lock()/spin_unlock() functions could be used.
>
> > +
> > +get_user_page_failed:
> > + free_buffer_pages(request_num_pages, request_pages);
> > + free_buffer_pages(response_num_pages, response_pages);
> > + free_buffer_pages(data_num_pages, data_pages);
> > +
> > + return ret;
> > +}
> > +
> > +static long az_blob_fop_ioctl(struct file *filp, unsigned int cmd,
> > + unsigned long arg)
> > +{
> > + switch (cmd) {
> > + case IOCTL_AZ_BLOB_DRIVER_USER_REQUEST:
> > + return az_blob_ioctl_user_request(filp, arg);
> > +
> > + default:
> > + az_blob_dbg("unrecognized IOCTL code %u\n", cmd);
> > + }
> > +
> > + return -EINVAL;
> > +}
> > +
> > +static const struct file_operations az_blob_client_fops = {
> > + .owner = THIS_MODULE,
> > + .open = az_blob_fop_open,
> > + .unlocked_ioctl = az_blob_fop_ioctl,
> > + .release = az_blob_fop_release,
> > +};
> > +
> > +static struct miscdevice az_blob_misc_device = {
> > + MISC_DYNAMIC_MINOR,
> > + "azure_blob",
> > + &az_blob_client_fops,
> > +};
> > +
> > +static int az_blob_show_pending_requests(struct seq_file *m, void *v)
> > +{
> > + unsigned long flags, flags2;
> > + struct az_blob_vsp_request_ctx *request_ctx;
> > + struct az_blob_file_ctx *file_ctx;
> > +
> > + seq_puts(m, "List of pending requests\n");
> > + seq_puts(m, "UUID request_len response_len data_len "
> > + "request_buffer response_buffer data_buffer\n");
> > + spin_lock_irqsave(&az_blob_dev.file_lock, flags);
> > + list_for_each_entry(file_ctx, &az_blob_dev.file_list, list) {
> > + spin_lock_irqsave(&file_ctx->vsp_pending_lock, flags2);
> > + list_for_each_entry(request_ctx,
> > + &file_ctx->vsp_pending_requests, list) {
> > + seq_printf(m, "%pUb ", &request_ctx->request-
> >guid);
> > + seq_printf(m, "%u ", request_ctx->request-
> >request_len);
> > + seq_printf(m, "%u ",
> > + request_ctx->request->response_len);
> > + seq_printf(m, "%u ", request_ctx->request-
> >data_len);
> > + seq_printf(m, "%llx ",
> > + request_ctx->request->request_buffer);
> > + seq_printf(m, "%llx ",
> > + request_ctx->request->response_buffer);
> > + seq_printf(m, "%llx\n",
> > + request_ctx->request->data_buffer);
> > + }
> > + spin_unlock_irqrestore(&file_ctx->vsp_pending_lock, flags2);
> > + }
> > + spin_unlock_irqrestore(&az_blob_dev.file_lock, flags);
> > +
> > + return 0;
> > +}
> > +
> > +static int az_blob_debugfs_open(struct inode *inode, struct file
> > +*file) {
> > + return single_open(file, az_blob_show_pending_requests, NULL); }
> > +
> > +static const struct file_operations az_blob_debugfs_fops = {
> > + .open = az_blob_debugfs_open,
> > + .read = seq_read,
> > + .llseek = seq_lseek,
> > + .release = seq_release
> > +};
> > +
> > +static void az_blob_remove_device(void) {
> > + struct dentry *debugfs_root = debugfs_lookup("az_blob", NULL);
> > +
> > + misc_deregister(&az_blob_misc_device);
> > + debugfs_remove_recursive(debugfs_lookup("pending_requests",
> > + debugfs_root));
> > + debugfs_remove_recursive(debugfs_root);
> > + /* At this point, we won't get any requests from user-mode */ }
> > +
> > +static int az_blob_create_device(struct az_blob_device *dev) {
> > + int ret;
> > + struct dentry *debugfs_root;
> > +
> > + ret = misc_register(&az_blob_misc_device);
> > + if (ret)
> > + return ret;
> > +
> > + debugfs_root = debugfs_create_dir("az_blob", NULL);
> > + debugfs_create_file("pending_requests", 0400, debugfs_root, NULL,
> > + &az_blob_debugfs_fops);
> > +
> > + return 0;
> > +}
> > +
> > +static int az_blob_connect_to_vsp(struct hv_device *device, u32
> > +ring_size) {
> > + int ret;
> > +
> > + spin_lock_init(&az_blob_dev.file_lock);
> > + INIT_LIST_HEAD(&az_blob_dev.file_list);
> > + init_waitqueue_head(&az_blob_dev.file_wait);
> > +
> > + az_blob_dev.device = device;
> > + device->channel->rqstor_size = AZ_BLOB_QUEUE_DEPTH;
> > +
> > + ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
> > + az_blob_on_channel_callback, device->channel);
> > +
> > + if (ret)
> > + return ret;
> > +
> > + hv_set_drvdata(device, &az_blob_dev);
> > +
> > + return ret;
> > +}
> > +
> > +static void az_blob_remove_vmbus(struct hv_device *device) {
> > + /* At this point, no VSC/VSP traffic is possible over vmbus */
> > + hv_set_drvdata(device, NULL);
> > + vmbus_close(device->channel);
> > +}
> > +
> > +static int az_blob_probe(struct hv_device *device,
> > + const struct hv_vmbus_device_id *dev_id) {
> > + int ret;
> > +
> > + ret = az_blob_connect_to_vsp(device, AZ_BLOB_RING_SIZE);
> > + if (ret) {
> > + az_blob_err("error connecting to VSP ret=%d\n", ret);
> > + return ret;
> > + }
> > +
> > + // create user-mode client library facing device
> > + ret = az_blob_create_device(&az_blob_dev);
> > + if (ret) {
> > + az_blob_err("failed to create device ret=%d\n", ret);
> > + az_blob_remove_vmbus(device);
> > + return ret;
> > + }
> > +
> > + az_blob_dev.removing = false;
>
> This line seems misplaced. As soon as az_blob_create_device() returns,
> some other thread could find the device and open it. You don't want the
> az_blob_fop_open() function to find the "removing"
> flag set to true. So I think this line should go *before* the call to
> az_blob_create_device().
>
> > + az_blob_info("successfully probed device\n");
> > +
> > + return 0;
> > +}
> > +
> > +static int az_blob_remove(struct hv_device *dev) {
> > + az_blob_dev.removing = true;
> > + /*
> > + * RCU lock guarantees that any future calls to az_blob_fop_open()
> > + * can not use device resources while the inode reference of
> > + * /dev/azure_blob is being held for that open, and device file is
> > + * being removed from /dev.
> > + */
> > + synchronize_rcu();
>
> I don't think this works as you have described. While it will ensure that any
> in-progress RCU read-side critical sections have completed (i.e., in
> az_blob_fop_open() ), it does not prevent new read-side critical sections
> from being started. So as soon as synchronize_rcu() is finished, another
> thread could find and open the device, and be executing in
> az_blob_fop_open().
>
> But it's not clear to me that this (and the rcu_read_locks in
> az_blob_fop_open) are really needed anyway. If az_blob_remove_device()
> is called while one or more threads have it open, I think that's OK. Or is there
> a scenario that I'm missing?
I was trying to address your comment earlier. Here were your comments (1 - 7):
1) The driver is unbound, so az_blob_remove() is called.
2) az_blob_remove() sets the "removing" flag to true, and calls az_blob_remove_device().
3) az_blob_remove_device() waits for the file_list to become empty.
4) After the file_list becomes empty, but before misc_deregister() is called, a separate thread opens the device again.
5) In the separate thread, az_blob_fop_open() obtains the file_lock spin lock.
6) Before az_blob_fop_open() releases the spin lock, az
block_remove_device() completes, along with az_blob_remove().
7) Then the device gets rebound, and az_blob_connect_to_vsp() gets called, all while az_blob_fop_open() still holds the spin lock. So the spin lock get re-initialized while it is held.
Between step 4 and step 5, I don't see any guarantee that az_blob_fop_open() can't run concurrently on another CPU after misc_deregister() finishes. misc_deregister() calls devtmpfs_delete_node() to remove the device file from /dev/*, but it doesn't check the return value, so the inode reference number can be non-zero after it returns, somebody may still try to open it. This check guarantees that the code can't reference any driver's internal data structures. az_blob_dev.removing is set so this code can't be entered. Resetting it after az_blob_create_device() is also for this purpose.
>
> > +
> > + az_blob_info("removing device\n");
> > + az_blob_remove_device();
> > +
> > + /*
> > + * At this point, it's not possible to open more files.
> > + * Wait for all the opened files to be released.
> > + */
> > + wait_event(az_blob_dev.file_wait,
> > +list_empty(&az_blob_dev.file_list));
>
> As mentioned in my most recent comments on the previous version of the
> code, I'm concerned about waiting for all open files to be released in the case
> of a VMbus rescind. We definitely have to wait for all VSP operations to
> complete, but that's different from waiting for the files to be closed. The
> former depends on Hyper-V being well-behaved and will presumably
> happen quickly in the case of a rescind. But the latter depends entirely on
> user space code that is out of the kernel's control. The user space process
> could hang around for hours or days with the file still open, which would
> block this function from completing, and hence block the global work queue.
>
> Thinking about this, the core issue may be that having a single static instance
> of az_blob_device is problematic if we allow the device to be removed
> (either explicitly with an unbind, or implicitly with a VMbus
> rescind) and then re-added. Perhaps what needs to happen is that the
> removed device is allowed to continue to exist as long as user space
> processes have an open file handle, but they can't perform any operations
> because the "removing" flag is set (and stays set).
> If the device is re-added, then a new instance of az_blob_device should be
> created, and whether or not the old instance is still hanging around is
> irrelevant.
I agree dynamic device objects is the way to go. Will implement this.
>
> > +
> > + az_blob_remove_vmbus(dev);
> > + return 0;
> > +}
> > +
> > +static struct hv_driver az_blob_drv = {
> > + .name = KBUILD_MODNAME,
> > + .id_table = id_table,
> > + .probe = az_blob_probe,
> > + .remove = az_blob_remove,
> > + .driver = {
> > + .probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > + },
> > +};
> > +
> > +static int __init az_blob_drv_init(void) {
> > + return vmbus_driver_register(&az_blob_drv);
> > +}
> > +
> > +static void __exit az_blob_drv_exit(void) {
> > + vmbus_driver_unregister(&az_blob_drv);
> > +}
> > +
> > +MODULE_LICENSE("Dual BSD/GPL");
> > +MODULE_DESCRIPTION("Microsoft Azure Blob driver");
> > +module_init(az_blob_drv_init); module_exit(az_blob_drv_exit);
> > diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> > index 705e95d..3095611 100644
> > --- a/drivers/hv/channel_mgmt.c
> > +++ b/drivers/hv/channel_mgmt.c
> > @@ -154,6 +154,13 @@
> > .allowed_in_isolated = false,
> > },
> >
> > + /* Azure Blob */
> > + { .dev_type = HV_AZURE_BLOB,
> > + HV_AZURE_BLOB_GUID,
> > + .perf_device = false,
> > + .allowed_in_isolated = false,
> > + },
> > +
> > /* Unknown GUID */
> > { .dev_type = HV_UNKNOWN,
> > .perf_device = false,
> > diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index
> > d1e59db..ac31362 100644
> > --- a/include/linux/hyperv.h
> > +++ b/include/linux/hyperv.h
> > @@ -772,6 +772,7 @@ enum vmbus_device_type {
> > HV_FCOPY,
> > HV_BACKUP,
> > HV_DM,
> > + HV_AZURE_BLOB,
> > HV_UNKNOWN,
> > };
> >
> > @@ -1350,6 +1351,14 @@ int vmbus_allocate_mmio(struct resource
> **new, struct hv_device *device_obj,
> > 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f)
> >
> > /*
> > + * Azure Blob GUID
> > + * {0590b792-db64-45cc-81db-b8d70c577c9e}
> > + */
> > +#define HV_AZURE_BLOB_GUID \
> > + .guid = GUID_INIT(0x0590b792, 0xdb64, 0x45cc, 0x81, 0xdb, \
> > + 0xb8, 0xd7, 0x0c, 0x57, 0x7c, 0x9e)
> > +
> > +/*
> > * Shutdown GUID
> > * {0e0b6031-5213-4934-818b-38d90ced39db}
> > */
> > diff --git a/include/uapi/misc/azure_blob.h
> > b/include/uapi/misc/azure_blob.h new file mode 100644 index
> > 0000000..f4168679
> > --- /dev/null
> > +++ b/include/uapi/misc/azure_blob.h
> > @@ -0,0 +1,34 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH
> > +Linux-syscall-note */
> > +/* Copyright (c) Microsoft Corporation. */
> > +
> > +#ifndef _AZ_BLOB_H
> > +#define _AZ_BLOB_H
> > +
> > +#include <linux/kernel.h>
> > +#include <linux/uuid.h>
> > +
> > +/* user-mode sync request sent through ioctl */ struct
> > +az_blob_request_sync_response {
> > + __u32 status;
> > + __u32 response_len;
> > +};
> > +
> > +struct az_blob_request_sync {
> > + guid_t guid;
> > + __u32 timeout;
> > + __u32 request_len;
> > + __u32 response_len;
> > + __u32 data_len;
> > + __u32 data_valid;
> > + __aligned_u64 request_buffer;
>
> Is there an implied 32-bit padding field before "request_buffer"?
> It seems like "yes", since there are five 32-bit field preceding.
> Would it be better to explicitly list that padding field?
>
> > + __aligned_u64 response_buffer;
> > + __aligned_u64 data_buffer;
> > + struct az_blob_request_sync_response response; };
> > +
> > +#define AZ_BLOB_MAGIC_NUMBER 'R'
> > +#define IOCTL_AZ_BLOB_DRIVER_USER_REQUEST \
> > + _IOWR(AZ_BLOB_MAGIC_NUMBER, 0xf0, \
> > + struct az_blob_request_sync)
> > +
> > +#endif /* define _AZ_BLOB_H */
> > --
> > 1.8.3.1
> -----Original Message-----
> From: Greg Kroah-Hartman <[email protected]>
> Sent: Friday, July 16, 2021 10:28 AM
> To: Michael Kelley <[email protected]>
> Cc: [email protected]; [email protected]; linux-
> [email protected]; Long Li <[email protected]>; Jonathan Corbet
> <[email protected]>; KY Srinivasan <[email protected]>; Haiyang Zhang
> <[email protected]>; Stephen Hemminger
> <[email protected]>; Wei Liu <[email protected]>; Dexuan Cui
> <[email protected]>; Bjorn Andersson <[email protected]>;
> Hans de Goede <[email protected]>; Williams, Dan J
> <[email protected]>; Maximilian Luz <[email protected]>;
> Mike Rapoport <[email protected]>; Ben Widawsky
> <[email protected]>; Jiri Slaby <[email protected]>; Andra
> Paraschiv <[email protected]>; Siddharth Gupta
> <[email protected]>; Hannes Reinecke <[email protected]>; linux-
> [email protected]
> Subject: Re: [Patch v3 2/3] Drivers: hv: add Azure Blob driver
>
> On Fri, Jul 16, 2021 at 03:56:14PM +0000, Michael Kelley wrote:
> > > +static int az_blob_remove(struct hv_device *dev) {
> > > + az_blob_dev.removing = true;
> > > + /*
> > > + * RCU lock guarantees that any future calls to az_blob_fop_open()
> > > + * can not use device resources while the inode reference of
> > > + * /dev/azure_blob is being held for that open, and device file is
> > > + * being removed from /dev.
> > > + */
> > > + synchronize_rcu();
> >
> > I don't think this works as you have described. While it will ensure
> > that any in-progress RCU read-side critical sections have completed
> > (i.e., in az_blob_fop_open() ), it does not prevent new read-side
> > critical sections from being started. So as soon as synchronize_rcu()
> > is finished, another thread could find and open the device, and be
> > executing in az_blob_fop_open().
> >
> > But it's not clear to me that this (and the rcu_read_locks in
> > az_blob_fop_open) are really needed anyway. If
> > az_blob_remove_device() is called while one or more threads have it open,
> I think that's OK. Or is there a scenario that I'm missing?
>
> This should not be different from any other tiny character device, why the
> mess with RCU at all?
>
> > > + az_blob_info("removing device\n");
> > > + az_blob_remove_device();
> > > +
> > > + /*
> > > + * At this point, it's not possible to open more files.
> > > + * Wait for all the opened files to be released.
> > > + */
> > > + wait_event(az_blob_dev.file_wait,
> > > +list_empty(&az_blob_dev.file_list));
> >
> > As mentioned in my most recent comments on the previous version of the
> > code, I'm concerned about waiting for all open files to be released in
> > the case of a VMbus rescind. We definitely have to wait for all VSP
> > operations to complete, but that's different from waiting for the
> > files to be closed. The former depends on Hyper-V being well-behaved
> > and will presumably happen quickly in the case of a rescind. But the
> > latter depends entirely on user space code that is out of the kernel's
> > control. The user space process could hang around for hours or days
> > with the file still open, which would block this function from completing,
> and hence block the global work queue.
> >
> > Thinking about this, the core issue may be that having a single static
> > instance of az_blob_device is problematic if we allow the device to be
> > removed (either explicitly with an unbind, or implicitly with a VMbus
> > rescind) and then re-added. Perhaps what needs to happen is that the
> > removed device is allowed to continue to exist as long as user space
> > processes have an open file handle, but they can't perform any
> > operations because the "removing" flag is set (and stays set).
> > If the device is re-added, then a new instance of az_blob_device
> > should be created, and whether or not the old instance is still
> > hanging around is irrelevant.
>
> You should never have a single static copy of the device, that was going to be
> my first review comment once this all actually got to a place that made sense
> to review (which it is not even there yet.) When you do that, then you have
> these crazy race issues you speak of. Use the misc api correctly and you will
> not have any of these problems, why people try to make it harder is beyond
> me...
>
> thanks,
>
> greg k-h
I will address all the comments and send the driver for broader review including
linux-fsdevel and linux-block.
Thanks,
Long
From: Long Li <[email protected]> Sent: Friday, July 16, 2021 12:27 PM
[snip]
> > > +
> > > +static int az_blob_connect_to_vsp(struct hv_device *device, u32
> > > +ring_size) {
> > > + int ret;
> > > +
> > > + spin_lock_init(&az_blob_dev.file_lock);
> > > + INIT_LIST_HEAD(&az_blob_dev.file_list);
> > > + init_waitqueue_head(&az_blob_dev.file_wait);
> > > +
> > > + az_blob_dev.device = device;
> > > + device->channel->rqstor_size = AZ_BLOB_QUEUE_DEPTH;
> > > +
> > > + ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
> > > + az_blob_on_channel_callback, device->channel);
> > > +
> > > + if (ret)
> > > + return ret;
> > > +
> > > + hv_set_drvdata(device, &az_blob_dev);
> > > +
> > > + return ret;
> > > +}
> > > +
> > > +static void az_blob_remove_vmbus(struct hv_device *device) {
> > > + /* At this point, no VSC/VSP traffic is possible over vmbus */
> > > + hv_set_drvdata(device, NULL);
> > > + vmbus_close(device->channel);
> > > +}
> > > +
> > > +static int az_blob_probe(struct hv_device *device,
> > > + const struct hv_vmbus_device_id *dev_id) {
> > > + int ret;
> > > +
> > > + ret = az_blob_connect_to_vsp(device, AZ_BLOB_RING_SIZE);
> > > + if (ret) {
> > > + az_blob_err("error connecting to VSP ret=%d\n", ret);
> > > + return ret;
> > > + }
> > > +
> > > + // create user-mode client library facing device
> > > + ret = az_blob_create_device(&az_blob_dev);
> > > + if (ret) {
> > > + az_blob_err("failed to create device ret=%d\n", ret);
> > > + az_blob_remove_vmbus(device);
> > > + return ret;
> > > + }
> > > +
> > > + az_blob_dev.removing = false;
> >
> > This line seems misplaced. As soon as az_blob_create_device() returns,
> > some other thread could find the device and open it. You don't want the
> > az_blob_fop_open() function to find the "removing"
> > flag set to true. So I think this line should go *before* the call to
> > az_blob_create_device().
> >
> > > + az_blob_info("successfully probed device\n");
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static int az_blob_remove(struct hv_device *dev) {
> > > + az_blob_dev.removing = true;
> > > + /*
> > > + * RCU lock guarantees that any future calls to az_blob_fop_open()
> > > + * can not use device resources while the inode reference of
> > > + * /dev/azure_blob is being held for that open, and device file is
> > > + * being removed from /dev.
> > > + */
> > > + synchronize_rcu();
> >
> > I don't think this works as you have described. While it will ensure that any
> > in-progress RCU read-side critical sections have completed (i.e., in
> > az_blob_fop_open() ), it does not prevent new read-side critical sections
> > from being started. So as soon as synchronize_rcu() is finished, another
> > thread could find and open the device, and be executing in
> > az_blob_fop_open().
> >
> > But it's not clear to me that this (and the rcu_read_locks in
> > az_blob_fop_open) are really needed anyway. If az_blob_remove_device()
> > is called while one or more threads have it open, I think that's OK. Or is there
> > a scenario that I'm missing?
>
> I was trying to address your comment earlier. Here were your comments (1 - 7):
>
> 1) The driver is unbound, so az_blob_remove() is called.
> 2) az_blob_remove() sets the "removing" flag to true, and calls az_blob_remove_device().
> 3) az_blob_remove_device() waits for the file_list to become empty.
> 4) After the file_list becomes empty, but before misc_deregister() is called, a separate thread opens the device again.
> 5) In the separate thread, az_blob_fop_open() obtains the file_lock spin lock.
> 6) Before az_blob_fop_open() releases the spin lock, az
> block_remove_device() completes, along with az_blob_remove().
> 7) Then the device gets rebound, and az_blob_connect_to_vsp() gets called, all while az_blob_fop_open() still holds the
> spin lock. So the spin lock get re-initialized while it is held.
>
> Between step 4 and step 5, I don't see any guarantee that az_blob_fop_open() can't run concurrently on another CPU
> after misc_deregister() finishes. misc_deregister() calls devtmpfs_delete_node() to remove the device file from /dev/*,
> but it doesn't check the return value, so the inode reference number can be non-zero after it returns, somebody may still
> try to open it.
I'm no expert here, but once misc_deregister() finishes, I would expect that
any attempt to open the device with the assigned major:minor number will
fail. However, existing opens may still be active. So another thread could still
have it open, but no new opens can be done. As coded in this version of your
patch, az_blob_remove() waits until all those existing opens have been closed,
so everything is clean once the wait is complete. Of course, while waiting
here, the threads with the open fd could try to start another operation
against the VSP, but the "removing" flag will prevent that. So the only
access these threads will have is to the singleton az_blob_dev instance
and the list of open files.
But as noted previously, waiting here for the opens to be closed is
problematic because the wait could be arbitrarily long. And there's
messiness if the device gets re-added later, because there's no
distinction between the old instantiation that a thread might still have
open vs. the new instantiation that you want to initialize fresh. That's
where creating a new dynamic instance will solve any problems.
> This check guarantees that the code can't reference any driver's internal data structures.
> az_blob_dev.removing is set so this code can't be entered. Resetting it after az_blob_create_device() is also for this
> purpose.
>
> >
> > > +
> > > + az_blob_info("removing device\n");
> > > + az_blob_remove_device();
> > > +
> > > + /*
> > > + * At this point, it's not possible to open more files.
> > > + * Wait for all the opened files to be released.
> > > + */
> > > + wait_event(az_blob_dev.file_wait,
> > > +list_empty(&az_blob_dev.file_list));
> >
> > As mentioned in my most recent comments on the previous version of the
> > code, I'm concerned about waiting for all open files to be released in the case
> > of a VMbus rescind. We definitely have to wait for all VSP operations to
> > complete, but that's different from waiting for the files to be closed. The
> > former depends on Hyper-V being well-behaved and will presumably
> > happen quickly in the case of a rescind. But the latter depends entirely on
> > user space code that is out of the kernel's control. The user space process
> > could hang around for hours or days with the file still open, which would
> > block this function from completing, and hence block the global work queue.
> >
> > Thinking about this, the core issue may be that having a single static instance
> > of az_blob_device is problematic if we allow the device to be removed
> > (either explicitly with an unbind, or implicitly with a VMbus
> > rescind) and then re-added. Perhaps what needs to happen is that the
> > removed device is allowed to continue to exist as long as user space
> > processes have an open file handle, but they can't perform any operations
> > because the "removing" flag is set (and stays set).
> > If the device is re-added, then a new instance of az_blob_device should be
> > created, and whether or not the old instance is still hanging around is
> > irrelevant.
>
> I agree dynamic device objects is the way to go. Will implement this.
>
> >
> > > +
> > > + az_blob_remove_vmbus(dev);
> > > + return 0;
> > > +}
> > > +
> > > +static struct hv_driver az_blob_drv = {
> > > + .name = KBUILD_MODNAME,
> > > + .id_table = id_table,
> > > + .probe = az_blob_probe,
> > > + .remove = az_blob_remove,
> > > + .driver = {
> > > + .probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > > + },
> > > +};
> > > +
> > > +static int __init az_blob_drv_init(void) {
> > > + return vmbus_driver_register(&az_blob_drv);
> > > +}
> > > +
> > > +static void __exit az_blob_drv_exit(void) {
> > > + vmbus_driver_unregister(&az_blob_drv);
> > > +}
> > > +
> > > +MODULE_LICENSE("Dual BSD/GPL");
> > > +MODULE_DESCRIPTION("Microsoft Azure Blob driver");
> > > +module_init(az_blob_drv_init); module_exit(az_blob_drv_exit);
> > > diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> > > index 705e95d..3095611 100644
> > > --- a/drivers/hv/channel_mgmt.c
> > > +++ b/drivers/hv/channel_mgmt.c
> > > @@ -154,6 +154,13 @@
> > > .allowed_in_isolated = false,
> > > },
> > >
> > > + /* Azure Blob */
> > > + { .dev_type = HV_AZURE_BLOB,
> > > + HV_AZURE_BLOB_GUID,
> > > + .perf_device = false,
> > > + .allowed_in_isolated = false,
> > > + },
> > > +
> > > /* Unknown GUID */
> > > { .dev_type = HV_UNKNOWN,
> > > .perf_device = false,
> > > diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index
> > > d1e59db..ac31362 100644
> > > --- a/include/linux/hyperv.h
> > > +++ b/include/linux/hyperv.h
> > > @@ -772,6 +772,7 @@ enum vmbus_device_type {
> > > HV_FCOPY,
> > > HV_BACKUP,
> > > HV_DM,
> > > + HV_AZURE_BLOB,
> > > HV_UNKNOWN,
> > > };
> > >
> > > @@ -1350,6 +1351,14 @@ int vmbus_allocate_mmio(struct resource
> > **new, struct hv_device *device_obj,
> > > 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f)
> > >
> > > /*
> > > + * Azure Blob GUID
> > > + * {0590b792-db64-45cc-81db-b8d70c577c9e}
> > > + */
> > > +#define HV_AZURE_BLOB_GUID \
> > > + .guid = GUID_INIT(0x0590b792, 0xdb64, 0x45cc, 0x81, 0xdb, \
> > > + 0xb8, 0xd7, 0x0c, 0x57, 0x7c, 0x9e)
> > > +
> > > +/*
> > > * Shutdown GUID
> > > * {0e0b6031-5213-4934-818b-38d90ced39db}
> > > */
> > > diff --git a/include/uapi/misc/azure_blob.h
> > > b/include/uapi/misc/azure_blob.h new file mode 100644 index
> > > 0000000..f4168679
> > > --- /dev/null
> > > +++ b/include/uapi/misc/azure_blob.h
> > > @@ -0,0 +1,34 @@
> > > +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH
> > > +Linux-syscall-note */
> > > +/* Copyright (c) Microsoft Corporation. */
> > > +
> > > +#ifndef _AZ_BLOB_H
> > > +#define _AZ_BLOB_H
> > > +
> > > +#include <linux/kernel.h>
> > > +#include <linux/uuid.h>
> > > +
> > > +/* user-mode sync request sent through ioctl */ struct
> > > +az_blob_request_sync_response {
> > > + __u32 status;
> > > + __u32 response_len;
> > > +};
> > > +
> > > +struct az_blob_request_sync {
> > > + guid_t guid;
> > > + __u32 timeout;
> > > + __u32 request_len;
> > > + __u32 response_len;
> > > + __u32 data_len;
> > > + __u32 data_valid;
> > > + __aligned_u64 request_buffer;
> >
> > Is there an implied 32-bit padding field before "request_buffer"?
> > It seems like "yes", since there are five 32-bit field preceding.
> > Would it be better to explicitly list that padding field?
> >
> > > + __aligned_u64 response_buffer;
> > > + __aligned_u64 data_buffer;
> > > + struct az_blob_request_sync_response response; };
> > > +
> > > +#define AZ_BLOB_MAGIC_NUMBER 'R'
> > > +#define IOCTL_AZ_BLOB_DRIVER_USER_REQUEST \
> > > + _IOWR(AZ_BLOB_MAGIC_NUMBER, 0xf0, \
> > > + struct az_blob_request_sync)
> > > +
> > > +#endif /* define _AZ_BLOB_H */
> > > --
> > > 1.8.3.1
> Subject: RE: [Patch v3 2/3] Drivers: hv: add Azure Blob driver
>
> From: Long Li <[email protected]> Sent: Friday, July 16, 2021 12:27 PM
>
> [snip]
>
> > > > +
> > > > +static int az_blob_connect_to_vsp(struct hv_device *device, u32
> > > > +ring_size) {
> > > > + int ret;
> > > > +
> > > > + spin_lock_init(&az_blob_dev.file_lock);
> > > > + INIT_LIST_HEAD(&az_blob_dev.file_list);
> > > > + init_waitqueue_head(&az_blob_dev.file_wait);
> > > > +
> > > > + az_blob_dev.device = device;
> > > > + device->channel->rqstor_size = AZ_BLOB_QUEUE_DEPTH;
> > > > +
> > > > + ret = vmbus_open(device->channel, ring_size, ring_size, NULL, 0,
> > > > + az_blob_on_channel_callback, device->channel);
> > > > +
> > > > + if (ret)
> > > > + return ret;
> > > > +
> > > > + hv_set_drvdata(device, &az_blob_dev);
> > > > +
> > > > + return ret;
> > > > +}
> > > > +
> > > > +static void az_blob_remove_vmbus(struct hv_device *device) {
> > > > + /* At this point, no VSC/VSP traffic is possible over vmbus */
> > > > + hv_set_drvdata(device, NULL);
> > > > + vmbus_close(device->channel);
> > > > +}
> > > > +
> > > > +static int az_blob_probe(struct hv_device *device,
> > > > + const struct hv_vmbus_device_id *dev_id) {
> > > > + int ret;
> > > > +
> > > > + ret = az_blob_connect_to_vsp(device, AZ_BLOB_RING_SIZE);
> > > > + if (ret) {
> > > > + az_blob_err("error connecting to VSP ret=%d\n", ret);
> > > > + return ret;
> > > > + }
> > > > +
> > > > + // create user-mode client library facing device
> > > > + ret = az_blob_create_device(&az_blob_dev);
> > > > + if (ret) {
> > > > + az_blob_err("failed to create device ret=%d\n", ret);
> > > > + az_blob_remove_vmbus(device);
> > > > + return ret;
> > > > + }
> > > > +
> > > > + az_blob_dev.removing = false;
> > >
> > > This line seems misplaced. As soon as az_blob_create_device()
> > > returns, some other thread could find the device and open it. You
> > > don't want the
> > > az_blob_fop_open() function to find the "removing"
> > > flag set to true. So I think this line should go *before* the call
> > > to az_blob_create_device().
> > >
> > > > + az_blob_info("successfully probed device\n");
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static int az_blob_remove(struct hv_device *dev) {
> > > > + az_blob_dev.removing = true;
> > > > + /*
> > > > + * RCU lock guarantees that any future calls to az_blob_fop_open()
> > > > + * can not use device resources while the inode reference of
> > > > + * /dev/azure_blob is being held for that open, and device file is
> > > > + * being removed from /dev.
> > > > + */
> > > > + synchronize_rcu();
> > >
> > > I don't think this works as you have described. While it will
> > > ensure that any in-progress RCU read-side critical sections have
> > > completed (i.e., in
> > > az_blob_fop_open() ), it does not prevent new read-side critical
> > > sections from being started. So as soon as synchronize_rcu() is
> > > finished, another thread could find and open the device, and be
> > > executing in az_blob_fop_open().
> > >
> > > But it's not clear to me that this (and the rcu_read_locks in
> > > az_blob_fop_open) are really needed anyway. If
> > > az_blob_remove_device() is called while one or more threads have it
> > > open, I think that's OK. Or is there a scenario that I'm missing?
> >
> > I was trying to address your comment earlier. Here were your comments (1
> - 7):
> >
> > 1) The driver is unbound, so az_blob_remove() is called.
> > 2) az_blob_remove() sets the "removing" flag to true, and calls
> az_blob_remove_device().
> > 3) az_blob_remove_device() waits for the file_list to become empty.
> > 4) After the file_list becomes empty, but before misc_deregister() is called,
> a separate thread opens the device again.
> > 5) In the separate thread, az_blob_fop_open() obtains the file_lock spin
> lock.
> > 6) Before az_blob_fop_open() releases the spin lock, az
> > block_remove_device() completes, along with az_blob_remove().
> > 7) Then the device gets rebound, and az_blob_connect_to_vsp() gets
> > called, all while az_blob_fop_open() still holds the spin lock. So the spin
> lock get re-initialized while it is held.
> >
> > Between step 4 and step 5, I don't see any guarantee that
> > az_blob_fop_open() can't run concurrently on another CPU after
> > misc_deregister() finishes. misc_deregister() calls
> > devtmpfs_delete_node() to remove the device file from /dev/*, but it
> doesn't check the return value, so the inode reference number can be non-
> zero after it returns, somebody may still try to open it.
>
> I'm no expert here, but once misc_deregister() finishes, I would expect that
> any attempt to open the device with the assigned major:minor number will
> fail. However, existing opens may still be active. So another thread could
> still
> have it open, but no new opens can be done. As coded in this version of
> your patch, az_blob_remove() waits until all those existing opens have been
> closed,
> so everything is clean once the wait is complete. Of course, while waiting
> here, the threads with the open fd could try to start another operation
> against the VSP, but the "removing" flag will prevent that. So the only access
> these threads will have is to the singleton az_blob_dev instance and the list
> of open files.
>
> But as noted previously, waiting here for the opens to be closed is
> problematic because the wait could be arbitrarily long. And there's
> messiness if the device gets re-added later, because there's no distinction
> between the old instantiation that a thread might still have open vs. the new
> instantiation that you want to initialize fresh. That's where creating a new
> dynamic instance will solve any problems.
As I said in my previous email, I'm moving to dynamically allocated objects to fix all of these.
>
> > This check guarantees that the code can't reference any driver's internal
> data structures.
> > az_blob_dev.removing is set so this code can't be entered. Resetting
> > it after az_blob_create_device() is also for this purpose.
> >
> > >
> > > > +
> > > > + az_blob_info("removing device\n");
> > > > + az_blob_remove_device();
> > > > +
> > > > + /*
> > > > + * At this point, it's not possible to open more files.
> > > > + * Wait for all the opened files to be released.
> > > > + */
> > > > + wait_event(az_blob_dev.file_wait,
> > > > +list_empty(&az_blob_dev.file_list));
> > >
> > > As mentioned in my most recent comments on the previous version of
> > > the code, I'm concerned about waiting for all open files to be
> > > released in the case of a VMbus rescind. We definitely have to wait
> > > for all VSP operations to complete, but that's different from
> > > waiting for the files to be closed. The former depends on Hyper-V
> > > being well-behaved and will presumably happen quickly in the case of
> > > a rescind. But the latter depends entirely on user space code that
> > > is out of the kernel's control. The user space process could hang
> > > around for hours or days with the file still open, which would block this
> function from completing, and hence block the global work queue.
> > >
> > > Thinking about this, the core issue may be that having a single
> > > static instance of az_blob_device is problematic if we allow the
> > > device to be removed (either explicitly with an unbind, or
> > > implicitly with a VMbus
> > > rescind) and then re-added. Perhaps what needs to happen is that
> > > the removed device is allowed to continue to exist as long as user
> > > space processes have an open file handle, but they can't perform any
> > > operations because the "removing" flag is set (and stays set).
> > > If the device is re-added, then a new instance of az_blob_device
> > > should be created, and whether or not the old instance is still
> > > hanging around is irrelevant.
> >
> > I agree dynamic device objects is the way to go. Will implement this.
> >
> > >
> > > > +
> > > > + az_blob_remove_vmbus(dev);
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static struct hv_driver az_blob_drv = {
> > > > + .name = KBUILD_MODNAME,
> > > > + .id_table = id_table,
> > > > + .probe = az_blob_probe,
> > > > + .remove = az_blob_remove,
> > > > + .driver = {
> > > > + .probe_type = PROBE_PREFER_ASYNCHRONOUS,
> > > > + },
> > > > +};
> > > > +
> > > > +static int __init az_blob_drv_init(void) {
> > > > + return vmbus_driver_register(&az_blob_drv);
> > > > +}
> > > > +
> > > > +static void __exit az_blob_drv_exit(void) {
> > > > + vmbus_driver_unregister(&az_blob_drv);
> > > > +}
> > > > +
> > > > +MODULE_LICENSE("Dual BSD/GPL");
> > > > +MODULE_DESCRIPTION("Microsoft Azure Blob driver");
> > > > +module_init(az_blob_drv_init); module_exit(az_blob_drv_exit);
> > > > diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> > > > index 705e95d..3095611 100644
> > > > --- a/drivers/hv/channel_mgmt.c
> > > > +++ b/drivers/hv/channel_mgmt.c
> > > > @@ -154,6 +154,13 @@
> > > > .allowed_in_isolated = false,
> > > > },
> > > >
> > > > + /* Azure Blob */
> > > > + { .dev_type = HV_AZURE_BLOB,
> > > > + HV_AZURE_BLOB_GUID,
> > > > + .perf_device = false,
> > > > + .allowed_in_isolated = false,
> > > > + },
> > > > +
> > > > /* Unknown GUID */
> > > > { .dev_type = HV_UNKNOWN,
> > > > .perf_device = false,
> > > > diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index
> > > > d1e59db..ac31362 100644
> > > > --- a/include/linux/hyperv.h
> > > > +++ b/include/linux/hyperv.h
> > > > @@ -772,6 +772,7 @@ enum vmbus_device_type {
> > > > HV_FCOPY,
> > > > HV_BACKUP,
> > > > HV_DM,
> > > > + HV_AZURE_BLOB,
> > > > HV_UNKNOWN,
> > > > };
> > > >
> > > > @@ -1350,6 +1351,14 @@ int vmbus_allocate_mmio(struct resource
> > > **new, struct hv_device *device_obj,
> > > > 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f)
> > > >
> > > > /*
> > > > + * Azure Blob GUID
> > > > + * {0590b792-db64-45cc-81db-b8d70c577c9e}
> > > > + */
> > > > +#define HV_AZURE_BLOB_GUID \
> > > > + .guid = GUID_INIT(0x0590b792, 0xdb64, 0x45cc, 0x81, 0xdb, \
> > > > + 0xb8, 0xd7, 0x0c, 0x57, 0x7c, 0x9e)
> > > > +
> > > > +/*
> > > > * Shutdown GUID
> > > > * {0e0b6031-5213-4934-818b-38d90ced39db}
> > > > */
> > > > diff --git a/include/uapi/misc/azure_blob.h
> > > > b/include/uapi/misc/azure_blob.h new file mode 100644 index
> > > > 0000000..f4168679
> > > > --- /dev/null
> > > > +++ b/include/uapi/misc/azure_blob.h
> > > > @@ -0,0 +1,34 @@
> > > > +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-only WITH
> > > > +Linux-syscall-note */
> > > > +/* Copyright (c) Microsoft Corporation. */
> > > > +
> > > > +#ifndef _AZ_BLOB_H
> > > > +#define _AZ_BLOB_H
> > > > +
> > > > +#include <linux/kernel.h>
> > > > +#include <linux/uuid.h>
> > > > +
> > > > +/* user-mode sync request sent through ioctl */ struct
> > > > +az_blob_request_sync_response {
> > > > + __u32 status;
> > > > + __u32 response_len;
> > > > +};
> > > > +
> > > > +struct az_blob_request_sync {
> > > > + guid_t guid;
> > > > + __u32 timeout;
> > > > + __u32 request_len;
> > > > + __u32 response_len;
> > > > + __u32 data_len;
> > > > + __u32 data_valid;
> > > > + __aligned_u64 request_buffer;
> > >
> > > Is there an implied 32-bit padding field before "request_buffer"?
> > > It seems like "yes", since there are five 32-bit field preceding.
> > > Would it be better to explicitly list that padding field?
> > >
> > > > + __aligned_u64 response_buffer;
> > > > + __aligned_u64 data_buffer;
> > > > + struct az_blob_request_sync_response response; };
> > > > +
> > > > +#define AZ_BLOB_MAGIC_NUMBER 'R'
> > > > +#define IOCTL_AZ_BLOB_DRIVER_USER_REQUEST \
> > > > + _IOWR(AZ_BLOB_MAGIC_NUMBER, 0xf0, \
> > > > + struct az_blob_request_sync)
> > > > +
> > > > +#endif /* define _AZ_BLOB_H */
> > > > --
> > > > 1.8.3.1