Use the DMA-based receive operation instead of the ioread8_rep-based
datagram receive when DMA datagrams are supported.
In the receive operation, configure the header to point to the
page-aligned, VMCI_MAX_DG_SIZE part of the receive buffer using an
S/G element that immediately follows the header. This ensures that
the existing dispatch routine can be used with little modification.
Initiate the receive by writing the lower 32 bits of the buffer
address to the VMCI_DATA_IN_LOW_ADDR register, and use a wait queue
to wait for the device to set the busy flag.
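The core of the new receive path, condensed from the vmci_read_data()
helper added below (MMIO/DMA case only; buffer_offset and size come
from the helper's dest/size arguments, and the opcode/size values are
simply those used in the patch):

	/* Header and its S/G element live on the first page of data_buffer. */
	struct vmci_data_in_out_header *hdr = vmci_dev->data_buffer;
	struct vmci_sg_elem *sg = (struct vmci_sg_elem *)(hdr + 1);

	hdr->opcode = 1;
	hdr->size = 1;
	hdr->busy = 0;
	sg[0].addr = vmci_dev->data_buffer_base + buffer_offset;
	sg[0].size = size;

	/* Hand the buffer's DMA address to the device, wait for completion. */
	vmci_write_reg(vmci_dev, lower_32_bits(vmci_dev->data_buffer_base),
		       VMCI_DATA_IN_LOW_ADDR);
	wait_event(vmci_dev->inout_wq, hdr->busy == 1);
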
The existing dispatch routine for received datagrams is reused
for the DMA datagrams with a few modifications:
- the receive buffer is always the maximum size for DMA datagrams
  (the IO port path first tries a shorter buffer to reduce the
  overhead of the ioread8_rep operation).
- for DMA datagrams, the datagrams are provided contiguously in the
  buffer, whereas IO port datagrams can start on any page boundary,
  as shown in the loop condition below
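The corresponding exit condition, condensed from the dispatch loop
hunk in this patch (only the IO port path keeps the page-boundary
special case):

	bool is_io_port = vmci_dev->mmio_base == NULL;
	...
	while (dg->dst.resource != VMCI_INVALID_ID ||
	       (is_io_port && remaining_bytes > PAGE_SIZE)) {
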
Reviewed-by: Vishnu Dasa <[email protected]>
Signed-off-by: Jorgen Hansen <[email protected]>
---
drivers/misc/vmw_vmci/vmci_guest.c | 103 ++++++++++++++++++++++-------
1 file changed, 79 insertions(+), 24 deletions(-)
diff --git a/drivers/misc/vmw_vmci/vmci_guest.c b/drivers/misc/vmw_vmci/vmci_guest.c
index ca73a1913404..2bcfa292772d 100644
--- a/drivers/misc/vmw_vmci/vmci_guest.c
+++ b/drivers/misc/vmw_vmci/vmci_guest.c
@@ -58,6 +58,7 @@ struct vmci_guest_device {
struct tasklet_struct datagram_tasklet;
struct tasklet_struct bm_tasklet;
+ struct wait_queue_head inout_wq;
void *data_buffer;
dma_addr_t data_buffer_base;
@@ -115,6 +116,36 @@ void vmci_write_reg(struct vmci_guest_device *dev, u32 val, u32 reg)
iowrite32(val, dev->iobase + reg);
}
+static void vmci_read_data(struct vmci_guest_device *vmci_dev,
+ void *dest, size_t size)
+{
+ if (vmci_dev->mmio_base == NULL)
+ ioread8_rep(vmci_dev->iobase + VMCI_DATA_IN_ADDR,
+ dest, size);
+ else {
+ /*
+ * For DMA datagrams, the data_buffer will contain the header on the
+ * first page, followed by the incoming datagram(s) on the following
+ * pages. The header uses an S/G element immediately following the
+ * header on the first page to point to the data area.
+ */
+ struct vmci_data_in_out_header *buffer_header = vmci_dev->data_buffer;
+ struct vmci_sg_elem *sg_array = (struct vmci_sg_elem *)(buffer_header + 1);
+ size_t buffer_offset = dest - vmci_dev->data_buffer;
+
+ buffer_header->opcode = 1;
+ buffer_header->size = 1;
+ buffer_header->busy = 0;
+ sg_array[0].addr = vmci_dev->data_buffer_base + buffer_offset;
+ sg_array[0].size = size;
+
+ vmci_write_reg(vmci_dev, lower_32_bits(vmci_dev->data_buffer_base),
+ VMCI_DATA_IN_LOW_ADDR);
+
+ wait_event(vmci_dev->inout_wq, buffer_header->busy == 1);
+ }
+}
+
int vmci_write_data(struct vmci_guest_device *dev, struct vmci_datagram *dg)
{
int result;
@@ -260,15 +291,17 @@ static int vmci_check_host_caps(struct pci_dev *pdev)
}
/*
- * Reads datagrams from the data in port and dispatches them. We
- * always start reading datagrams into only the first page of the
- * datagram buffer. If the datagrams don't fit into one page, we
- * use the maximum datagram buffer size for the remainder of the
- * invocation. This is a simple heuristic for not penalizing
- * small datagrams.
+ * Reads datagrams from the device and dispatches them. For IO port
+ * based access to the device, we always start reading datagrams into
+ * only the first page of the datagram buffer. If the datagrams don't
+ * fit into one page, we use the maximum datagram buffer size for the
+ * remainder of the invocation. This is a simple heuristic for not
+ * penalizing small datagrams. For DMA-based datagrams, we always
+ * use the maximum datagram buffer size, since there is no performance
+ * penalty for doing so.
*
* This function assumes that it has exclusive access to the data
- * in port for the duration of the call.
+ * in register(s) for the duration of the call.
*/
static void vmci_dispatch_dgs(unsigned long data)
{
@@ -276,23 +309,41 @@ static void vmci_dispatch_dgs(unsigned long data)
u8 *dg_in_buffer = vmci_dev->data_buffer;
struct vmci_datagram *dg;
size_t dg_in_buffer_size = VMCI_MAX_DG_SIZE;
- size_t current_dg_in_buffer_size = PAGE_SIZE;
+ size_t current_dg_in_buffer_size;
size_t remaining_bytes;
+ bool is_io_port = vmci_dev->mmio_base == NULL;
BUILD_BUG_ON(VMCI_MAX_DG_SIZE < PAGE_SIZE);
- ioread8_rep(vmci_dev->iobase + VMCI_DATA_IN_ADDR,
- vmci_dev->data_buffer, current_dg_in_buffer_size);
+ if (!is_io_port) {
+ /* For mmio, the first page is used for the header. */
+ dg_in_buffer += PAGE_SIZE;
+
+ /*
+ * For DMA-based datagram operations, there is no performance
+ * penalty for reading the maximum buffer size.
+ */
+ current_dg_in_buffer_size = VMCI_MAX_DG_SIZE;
+ } else {
+ current_dg_in_buffer_size = PAGE_SIZE;
+ }
+ vmci_read_data(vmci_dev, dg_in_buffer, current_dg_in_buffer_size);
dg = (struct vmci_datagram *)dg_in_buffer;
remaining_bytes = current_dg_in_buffer_size;
+ /*
+ * Read through the buffer until an invalid datagram header is
+ * encountered. The exit condition for datagrams read through
+ * VMCI_DATA_IN_ADDR is a bit more complicated, since a datagram
+ * can start on any page boundary in the buffer.
+ */
while (dg->dst.resource != VMCI_INVALID_ID ||
- remaining_bytes > PAGE_SIZE) {
+ (is_io_port && remaining_bytes > PAGE_SIZE)) {
unsigned dg_in_size;
/*
- * When the input buffer spans multiple pages, a datagram can
- * start on any page boundary in the buffer.
+ * If using VMCI_DATA_IN_ADDR, skip to the next page
+ * as a datagram can start on any page boundary.
*/
if (dg->dst.resource == VMCI_INVALID_ID) {
dg = (struct vmci_datagram *)roundup(
@@ -342,11 +393,10 @@ static void vmci_dispatch_dgs(unsigned long data)
current_dg_in_buffer_size =
dg_in_buffer_size;
- ioread8_rep(vmci_dev->iobase +
- VMCI_DATA_IN_ADDR,
- vmci_dev->data_buffer +
+ vmci_read_data(vmci_dev,
+ dg_in_buffer +
remaining_bytes,
- current_dg_in_buffer_size -
+ current_dg_in_buffer_size -
remaining_bytes);
}
@@ -384,10 +434,8 @@ static void vmci_dispatch_dgs(unsigned long data)
current_dg_in_buffer_size = dg_in_buffer_size;
for (;;) {
- ioread8_rep(vmci_dev->iobase +
- VMCI_DATA_IN_ADDR,
- vmci_dev->data_buffer,
- current_dg_in_buffer_size);
+ vmci_read_data(vmci_dev, dg_in_buffer,
+ current_dg_in_buffer_size);
if (bytes_to_skip <= current_dg_in_buffer_size)
break;
@@ -404,8 +452,7 @@ static void vmci_dispatch_dgs(unsigned long data)
if (remaining_bytes < VMCI_DG_HEADERSIZE) {
/* Get the next batch of datagrams. */
- ioread8_rep(vmci_dev->iobase + VMCI_DATA_IN_ADDR,
- vmci_dev->data_buffer,
+ vmci_read_data(vmci_dev, dg_in_buffer,
current_dg_in_buffer_size);
dg = (struct vmci_datagram *)dg_in_buffer;
remaining_bytes = current_dg_in_buffer_size;
@@ -463,8 +510,11 @@ static irqreturn_t vmci_interrupt(int irq, void *_dev)
icr &= ~VMCI_ICR_NOTIFICATION;
}
- if (icr & VMCI_ICR_DMA_DATAGRAM)
+
+ if (icr & VMCI_ICR_DMA_DATAGRAM) {
+ wake_up_all(&dev->inout_wq);
icr &= ~VMCI_ICR_DMA_DATAGRAM;
+ }
if (icr != 0)
dev_warn(dev->dev,
@@ -497,6 +547,10 @@ static irqreturn_t vmci_interrupt_bm(int irq, void *_dev)
*/
static irqreturn_t vmci_interrupt_dma_datagram(int irq, void *_dev)
{
+ struct vmci_guest_device *dev = _dev;
+
+ wake_up_all(&dev->inout_wq);
+
return IRQ_HANDLED;
}
@@ -586,6 +640,7 @@ static int vmci_guest_probe_device(struct pci_dev *pdev,
vmci_dispatch_dgs, (unsigned long)vmci_dev);
tasklet_init(&vmci_dev->bm_tasklet,
vmci_process_bitmap, (unsigned long)vmci_dev);
+ init_waitqueue_head(&vmci_dev->inout_wq);
if (mmio_base != NULL) {
vmci_dev->tx_buffer = dma_alloc_coherent(&pdev->dev, VMCI_DMA_DG_BUFFER_SIZE,
--
2.25.1
Hi Jorgen,
On Wed, Feb 02, 2022 at 06:49:10AM -0800, Jorgen Hansen wrote:
> Use the DMA-based receive operation instead of the ioread8_rep-based
> datagram receive when DMA datagrams are supported.
>
> In the receive operation, configure the header to point to the
> page-aligned, VMCI_MAX_DG_SIZE part of the receive buffer using an
> S/G element that immediately follows the header. This ensures that
> the existing dispatch routine can be used with little modification.
> Initiate the receive by writing the lower 32 bits of the buffer
> address to the VMCI_DATA_IN_LOW_ADDR register, and use a wait queue
> to wait for the device to set the busy flag.
>
> The existing dispatch routine for received datagrams is reused
> for the DMA datagrams with a few modifications:
> - the receive buffer is always the maximum size for DMA datagrams
>   (the IO port path first tries a shorter buffer to reduce the
>   overhead of the ioread8_rep operation).
> - for DMA datagrams, the datagrams are provided contiguously in the
>   buffer, whereas IO port datagrams can start on any page boundary
>
> Reviewed-by: Vishnu Dasa <[email protected]>
> Signed-off-by: Jorgen Hansen <[email protected]>
> ---
> drivers/misc/vmw_vmci/vmci_guest.c | 103 ++++++++++++++++++++++-------
> 1 file changed, 79 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/misc/vmw_vmci/vmci_guest.c b/drivers/misc/vmw_vmci/vmci_guest.c
> index ca73a1913404..2bcfa292772d 100644
> --- a/drivers/misc/vmw_vmci/vmci_guest.c
> +++ b/drivers/misc/vmw_vmci/vmci_guest.c
> @@ -58,6 +58,7 @@ struct vmci_guest_device {
>
> struct tasklet_struct datagram_tasklet;
> struct tasklet_struct bm_tasklet;
> + struct wait_queue_head inout_wq;
>
> void *data_buffer;
> dma_addr_t data_buffer_base;
> @@ -115,6 +116,36 @@ void vmci_write_reg(struct vmci_guest_device *dev, u32 val, u32 reg)
> iowrite32(val, dev->iobase + reg);
> }
>
> +static void vmci_read_data(struct vmci_guest_device *vmci_dev,
> + void *dest, size_t size)
> +{
> + if (vmci_dev->mmio_base == NULL)
> + ioread8_rep(vmci_dev->iobase + VMCI_DATA_IN_ADDR,
> + dest, size);
> + else {
> + /*
> + * For DMA datagrams, the data_buffer will contain the header on the
> + * first page, followed by the incoming datagram(s) on the following
> + * pages. The header uses an S/G element immediately following the
> + * header on the first page to point to the data area.
> + */
> + struct vmci_data_in_out_header *buffer_header = vmci_dev->data_buffer;
> + struct vmci_sg_elem *sg_array = (struct vmci_sg_elem *)(buffer_header + 1);
> + size_t buffer_offset = dest - vmci_dev->data_buffer;
> +
> + buffer_header->opcode = 1;
> + buffer_header->size = 1;
> + buffer_header->busy = 0;
> + sg_array[0].addr = vmci_dev->data_buffer_base + buffer_offset;
> + sg_array[0].size = size;
> +
> + vmci_write_reg(vmci_dev, lower_32_bits(vmci_dev->data_buffer_base),
> + VMCI_DATA_IN_LOW_ADDR);
> +
> + wait_event(vmci_dev->inout_wq, buffer_header->busy == 1);
Apologies if this has already been reported somewhere, if there is a
patch for this issue that I did not notice, or if this change is not
the root cause (I did not bisect, I just inferred it from the call
chain).
When CONFIG_DEBUG_ATOMIC_SLEEP is enabled, this call to wait_event()
causes a BUG in dmesg, which I see when booting Fedora Rawhide in VMware
Fusion on my MacBook Air. As far as I can tell, the kernel is vanilla.
[ 20.264639] BUG: sleeping function called from invalid context at drivers/misc/vmw_vmci/vmci_guest.c:145
[ 20.264643] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 762, name: vmtoolsd
[ 20.264645] preempt_count: 101, expected: 0
[ 20.264646] RCU nest depth: 0, expected: 0
[ 20.264647] 1 lock held by vmtoolsd/762:
[ 20.264648] #0: ffff0000874ae440 (sk_lock-AF_VSOCK){+.+.}-{0:0}, at: vsock_connect+0x60/0x330 [vsock]
[ 20.264658] Preemption disabled at:
[ 20.264659] [<ffff80000151d7d8>] vmci_send_datagram+0x44/0xa0 [vmw_vmci]
[ 20.264665] CPU: 0 PID: 762 Comm: vmtoolsd Not tainted 5.19.0-0.rc8.20220727git39c3c396f813.60.fc37.aarch64 #1
[ 20.264667] Hardware name: VMware, Inc. VBSA/VBSA, BIOS VEFI 12/31/2020
[ 20.264668] Call trace:
[ 20.264669] dump_backtrace+0xc4/0x130
[ 20.264672] show_stack+0x24/0x80
[ 20.264673] dump_stack_lvl+0x88/0xb4
[ 20.264676] dump_stack+0x18/0x34
[ 20.264677] __might_resched+0x1a0/0x280
[ 20.264679] __might_sleep+0x58/0x90
[ 20.264681] vmci_read_data+0x74/0x120 [vmw_vmci]
[ 20.264683] vmci_dispatch_dgs+0x64/0x204 [vmw_vmci]
[ 20.264686] tasklet_action_common.constprop.0+0x13c/0x150
[ 20.264688] tasklet_action+0x40/0x50
[ 20.264689] __do_softirq+0x23c/0x6b4
[ 20.264690] __irq_exit_rcu+0x104/0x214
[ 20.264691] irq_exit_rcu+0x1c/0x50
[ 20.264693] el1_interrupt+0x38/0x6c
[ 20.264695] el1h_64_irq_handler+0x18/0x24
[ 20.264696] el1h_64_irq+0x68/0x6c
[ 20.264697] preempt_count_sub+0xa4/0xe0
[ 20.264698] _raw_spin_unlock_irqrestore+0x64/0xb0
[ 20.264701] vmci_send_datagram+0x7c/0xa0 [vmw_vmci]
[ 20.264703] vmci_datagram_dispatch+0x84/0x100 [vmw_vmci]
[ 20.264706] vmci_datagram_send+0x2c/0x40 [vmw_vmci]
[ 20.264709] vmci_transport_send_control_pkt+0xb8/0x120 [vmw_vsock_vmci_transport]
[ 20.264711] vmci_transport_connect+0x40/0x7c [vmw_vsock_vmci_transport]
[ 20.264713] vsock_connect+0x278/0x330 [vsock]
[ 20.264715] __sys_connect_file+0x8c/0xc0
[ 20.264718] __sys_connect+0x84/0xb4
[ 20.264720] __arm64_sys_connect+0x2c/0x3c
[ 20.264721] invoke_syscall+0x78/0x100
[ 20.264723] el0_svc_common.constprop.0+0x68/0x124
[ 20.264724] do_el0_svc+0x38/0x4c
[ 20.264725] el0_svc+0x60/0x180
[ 20.264726] el0t_64_sync_handler+0x11c/0x150
[ 20.264728] el0t_64_sync+0x190/0x194
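In short (my reading of the trace above, not a proposed fix):
vmci_dispatch_dgs() runs as a tasklet, i.e. in softirq context with
preemption disabled, while the new vmci_read_data() path ends in
wait_event(), which may sleep:

  tasklet_action()
      -> vmci_dispatch_dgs()   /* softirq, preemption disabled */
          -> vmci_read_data()  /* DMA datagram receive path */
              -> wait_event()  /* may sleep => DEBUG_ATOMIC_SLEEP BUG */
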
Attached is the entire dmesg in case it is useful, as I see several
messages with different "preemption disabled at" values.
If there is any additional information I can provide or patches I can
test, please let me know.
Cheers,
Nathan