Tom,
Since we use INTX_DISABLE internally for PCI 2.3 devices, it's probably
not a good idea to allow vfio users direct access to it. In trying to
virtualize it, I stumbled on some config space virtualization issues.
I think we want vconfig and the value written to hardware to be different
when there are virtualized bits, but we lump them together. I think this
is why I've never been able to rely on the memory enable bit of config
space through vfio. With this first patch, all virtualized bits that
are writable are sticky in vconfig. Non-writable bits come from either
the old value or the physical hardware depending on whether it's
virtualized. This means we can get rid of a lot of duplicated setting
and reading of vconfig, and only add code for more complicated
behaviors. This is tricky code, to please double check that I'm not
doing something stupid. Thanks,
Alex
---
Alex Williamson (2):
vfio: Virtualize PCI_COMMAND_INTX_DISABLE
vfio: Fix config space virtualization
drivers/vfio/vfio_intrs.c | 38 ++++++++----
drivers/vfio/vfio_main.c | 14 ++++
drivers/vfio/vfio_pci_config.c | 131 ++++++++++++----------------------------
include/linux/vfio.h | 4 +
4 files changed, 84 insertions(+), 103 deletions(-)
--
We're currently masking out virtualized bits when updating both
physical device registers and vconfig. I think we really want
vconfig to track virtualized bits, otherwise they're not much
different that unwritable bits.
Signed-off-by: Alex Williamson <[email protected]>
---
drivers/vfio/vfio_pci_config.c | 118 ++++++++++------------------------------
1 files changed, 29 insertions(+), 89 deletions(-)
diff --git a/drivers/vfio/vfio_pci_config.c b/drivers/vfio/vfio_pci_config.c
index 8304316..bb38dbe 100644
--- a/drivers/vfio/vfio_pci_config.c
+++ b/drivers/vfio/vfio_pci_config.c
@@ -820,21 +820,10 @@ static inline int vfio_write_config_byte(struct vfio_dev *vdev,
}
/* handle virtualized fields in the basic config space */
-static u8 vfio_virt_basic(struct vfio_dev *vdev, int write,
+static void vfio_virt_basic(struct vfio_dev *vdev, int write,
u16 pos, u16 off, u8 val, u8 newval)
{
- switch (off) {
- /*
- * vendor and device are virt because they don't
- * show up otherwise for sr-iov vfs
- */
- case PCI_VENDOR_ID:
- case PCI_VENDOR_ID + 1:
- case PCI_DEVICE_ID:
- case PCI_DEVICE_ID + 1:
- /* read only */
- val = vdev->vconfig[pos];
- break;
+ switch (pos) {
case PCI_COMMAND:
/*
* If the real mem or IO enable bits are zero
@@ -842,11 +831,15 @@ static u8 vfio_virt_basic(struct vfio_dev *vdev, int write,
* Restore the real BARs before allowing those
* bits to re-enable
*/
- if (vdev->pdev->is_virtfn)
- val |= PCI_COMMAND_MEMORY;
if (write) {
int upd = 0;
+ if (vfio_read_config_byte(vdev, pos, &val) < 0)
+ return;
+
+ if (vdev->pdev->is_virtfn)
+ val |= PCI_COMMAND_MEMORY;
+
upd = (newval & PCI_COMMAND_MEMORY) >
(val & PCI_COMMAND_MEMORY);
upd += (newval & PCI_COMMAND_IO) >
@@ -856,61 +849,26 @@ static u8 vfio_virt_basic(struct vfio_dev *vdev, int write,
vfio_write_config_byte(vdev, pos, newval);
}
break;
- case PCI_BASE_ADDRESS_0:
- case PCI_BASE_ADDRESS_0+1:
- case PCI_BASE_ADDRESS_0+2:
- case PCI_BASE_ADDRESS_0+3:
- case PCI_BASE_ADDRESS_1:
- case PCI_BASE_ADDRESS_1+1:
- case PCI_BASE_ADDRESS_1+2:
- case PCI_BASE_ADDRESS_1+3:
- case PCI_BASE_ADDRESS_2:
- case PCI_BASE_ADDRESS_2+1:
- case PCI_BASE_ADDRESS_2+2:
- case PCI_BASE_ADDRESS_2+3:
- case PCI_BASE_ADDRESS_3:
- case PCI_BASE_ADDRESS_3+1:
- case PCI_BASE_ADDRESS_3+2:
- case PCI_BASE_ADDRESS_3+3:
- case PCI_BASE_ADDRESS_4:
- case PCI_BASE_ADDRESS_4+1:
- case PCI_BASE_ADDRESS_4+2:
- case PCI_BASE_ADDRESS_4+3:
- case PCI_BASE_ADDRESS_5:
- case PCI_BASE_ADDRESS_5+1:
- case PCI_BASE_ADDRESS_5+2:
- case PCI_BASE_ADDRESS_5+3:
- case PCI_ROM_ADDRESS:
- case PCI_ROM_ADDRESS+1:
- case PCI_ROM_ADDRESS+2:
- case PCI_ROM_ADDRESS+3:
+ case PCI_BASE_ADDRESS_0 ... PCI_BASE_ADDRESS_5 + 3:
+ case PCI_ROM_ADDRESS ... PCI_ROM_ADDRESS + 3:
if (write) {
- vdev->vconfig[pos] = newval;
vdev->bardirty = 1;
} else {
if (vdev->bardirty)
vfio_bar_fixup(vdev);
- val = vdev->vconfig[pos];
}
break;
- default:
- if (write)
- vdev->vconfig[pos] = newval;
- else
- val = vdev->vconfig[pos];
- break;
}
- return val;
}
/*
* handle virtualized fields in msi capability
* easy, except for multiple-msi fields in flags byte
*/
-static u8 vfio_virt_msi(struct vfio_dev *vdev, int write,
+static void vfio_virt_msi(struct vfio_dev *vdev, int write,
u16 pos, u16 off, u8 val, u8 newval)
{
- if (off == PCI_MSI_FLAGS) {
+ if (pos == PCI_MSI_FLAGS) {
u8 num;
if (write) {
@@ -924,18 +882,12 @@ static u8 vfio_virt_msi(struct vfio_dev *vdev, int write,
vfio_write_config_byte(vdev, pos, newval);
} else {
if (vfio_read_config_byte(vdev, pos, &val) < 0)
- return 0;
+ return;
val &= ~PCI_MSI_FLAGS_QMASK;
val |= vdev->msi_qmax << 1;
+ vdev->vconfig[pos] = val;
}
- return val;
}
-
- if (write)
- vdev->vconfig[pos] = newval;
- else
- val = vdev->vconfig[pos];
- return val;
}
static int vfio_config_rwbyte(int write,
@@ -1014,25 +966,25 @@ static int vfio_config_rwbyte(int write,
return 0;
}
+ val = vdev->vconfig[pos];
if (write) {
+ u8 pval, rbits = (~virt) & wr;
+
if (copy_from_user(&newval, buf, 1))
return -EFAULT;
- }
- /*
- * We get here if there are some virt bits
- * handle remaining real bits, if any
- */
- if (~virt) {
- u8 rbits = (~virt) & wr;
- ret = vfio_read_config_byte(vdev, pos, &val);
+ ret = vfio_read_config_byte(vdev, pos, &pval);
if (ret < 0)
return ret;
- if (write && rbits) {
- val &= ~rbits;
- val |= (newval & rbits);
- vfio_write_config_byte(vdev, pos, val);
+
+ if (rbits) {
+ pval &= ~rbits;
+ pval |= (newval & rbits);
+ pci_user_write_config_byte(vdev->pdev, pos, pval);
}
+ vdev->vconfig[pos] = (pval & ~(virt | wr)) |
+ (val & (virt & ~wr)) |
+ (newval & wr);
}
/*
* Now handle entirely virtual fields
@@ -1040,32 +992,20 @@ static int vfio_config_rwbyte(int write,
if (pos < PCI_CFG_SPACE_SIZE) {
switch (cap) {
case PCI_CAP_ID_BASIC: /* virtualize BARs */
- val = vfio_virt_basic(vdev, write,
- pos, off, val, newval);
+ vfio_virt_basic(vdev, write, pos, off, val, newval);
break;
case PCI_CAP_ID_MSI: /* virtualize (parts of) MSI */
- val = vfio_virt_msi(vdev, write,
- pos, off, val, newval);
- break;
- default:
- if (write)
- vdev->vconfig[pos] = newval;
- else
- val = vdev->vconfig[pos];
+ vfio_virt_msi(vdev, write, pos, off, val, newval);
break;
}
} else {
/* no virt fields yet in ecaps */
switch (cap) { /* extended capabilities */
default:
- if (write)
- vdev->vconfig[pos] = newval;
- else
- val = vdev->vconfig[pos];
break;
}
}
- if (!write && copy_to_user(buf, &val, 1))
+ if (!write && copy_to_user(buf, &vdev->vconfig[pos], 1))
return -EFAULT;
return 0;
}
As we use this internally for interrupt control, it's dangerous
to let the user manipulate it. Instead, virtualize it. Also,
de-assert INTX_DISABLE when device is opened, the device reset
doesn't seem to clear this.
Signed-off-by: Alex Williamson <[email protected]>
---
drivers/vfio/vfio_intrs.c | 38 +++++++++++++++++++++++++++-----------
drivers/vfio/vfio_main.c | 14 +++++++++++++-
drivers/vfio/vfio_pci_config.c | 13 +++++++++++--
include/linux/vfio.h | 4 ++++
4 files changed, 55 insertions(+), 14 deletions(-)
diff --git a/drivers/vfio/vfio_intrs.c b/drivers/vfio/vfio_intrs.c
index 73e3deb..ace0195 100644
--- a/drivers/vfio/vfio_intrs.c
+++ b/drivers/vfio/vfio_intrs.c
@@ -47,23 +47,22 @@
/*
* vfio_interrupt - IRQ hardware interrupt handler
*/
-irqreturn_t vfio_interrupt(int irq, void *dev_id)
+irqreturn_t vfio_disable_intx(struct vfio_dev *vdev)
{
- struct vfio_dev *vdev = dev_id;
struct pci_dev *pdev = vdev->pdev;
irqreturn_t ret = IRQ_NONE;
- u32 cmd_status_dword;
- u16 origcmd, newcmd, status;
spin_lock_irq(&vdev->irqlock);
- /* INTX disabled interrupts can still be shared */
if (vdev->irq_disabled) {
spin_unlock_irq(&vdev->irqlock);
return ret;
}
if (vdev->pci_2_3) {
+ u32 cmd_status_dword;
+ u16 origcmd, newcmd, status;
+
pci_block_user_cfg_access(pdev);
/* Read both command and status registers in a single 32-bit
@@ -98,15 +97,10 @@ done:
spin_unlock_irq(&vdev->irqlock);
- if (ret != IRQ_HANDLED)
- return ret;
-
- if (vdev->ev_irq)
- eventfd_signal(vdev->ev_irq, 1);
return ret;
}
-int vfio_irq_eoi(struct vfio_dev *vdev)
+void vfio_enable_intx(struct vfio_dev *vdev)
{
struct pci_dev *pdev = vdev->pdev;
@@ -129,6 +123,28 @@ int vfio_irq_eoi(struct vfio_dev *vdev)
}
spin_unlock_irq(&vdev->irqlock);
+}
+
+irqreturn_t vfio_interrupt(int irq, void *dev_id)
+{
+ struct vfio_dev *vdev = dev_id;
+ irqreturn_t ret = vfio_disable_intx(vdev);
+
+ if (ret != IRQ_HANDLED)
+ return ret;
+
+ if (vdev->ev_irq)
+ eventfd_signal(vdev->ev_irq, 1);
+ return ret;
+}
+
+int vfio_irq_eoi(struct vfio_dev *vdev)
+{
+ /* EOI shouldn't re-enable intx if disabled by INTX_DISABLE */
+ if (vdev->vconfig[PCI_COMMAND+1] & PCI_CMD_INTX_DISABLE_BYTE)
+ return 0;
+
+ vfio_enable_intx(vdev);
return 0;
}
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 3cd3cb8..3f51eae 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -110,8 +110,14 @@ static int vfio_open(struct inode *inode, struct file *filep)
listener->vdev = vdev;
INIT_LIST_HEAD(&listener->dm_list);
if (vdev->listeners == 0) {
+ u16 cmd;
(void) pci_reset_function(vdev->pdev);
msleep(100); /* 100ms for reset recovery */
+ pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
+ if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
+ cmd &= ~PCI_COMMAND_INTX_DISABLE;
+ pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
+ }
ret = pci_enable_device(vdev->pdev);
}
if (!ret) {
@@ -172,6 +178,7 @@ static int vfio_release(struct inode *inode, struct file *filep)
free_irq(vdev->pdev->irq, vdev);
eventfd_ctx_put(vdev->ev_irq);
vdev->ev_irq = NULL;
+ vdev->irq_disabled = false;
}
kfree(vdev->vconfig);
vdev->vconfig = NULL;
@@ -391,6 +398,7 @@ static long vfio_unl_ioctl(struct file *filep,
if (vdev->ev_irq) {
eventfd_ctx_put(vdev->ev_irq);
free_irq(pdev->irq, vdev);
+ vdev->irq_disabled = false;
vdev->ev_irq = NULL;
}
if (vdev->ev_msi) { /* irq and msi both use pdev->irq */
@@ -398,11 +406,15 @@ static long vfio_unl_ioctl(struct file *filep,
} else {
if (fd >= 0) {
vdev->ev_irq = eventfd_ctx_fdget(fd);
- if (vdev->ev_irq)
+ if (vdev->ev_irq) {
ret = request_irq(pdev->irq,
vfio_interrupt,
vdev->pci_2_3 ? IRQF_SHARED : 0,
vdev->name, vdev);
+ if (vdev->vconfig[PCI_COMMAND+1] &
+ PCI_CMD_INTX_DISABLE_BYTE)
+ vfio_disable_intx(vdev);
+ }
else
ret = -EINVAL;
}
diff --git a/drivers/vfio/vfio_pci_config.c b/drivers/vfio/vfio_pci_config.c
index bb38dbe..6257070 100644
--- a/drivers/vfio/vfio_pci_config.c
+++ b/drivers/vfio/vfio_pci_config.c
@@ -185,8 +185,8 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
/* for catching resume-after-reset */
p_setw(perm, PCI_COMMAND,
- PCI_COMMAND_MEMORY + PCI_COMMAND_IO,
- ALL_WRITE);
+ PCI_COMMAND_MEMORY + PCI_COMMAND_IO + PCI_COMMAND_INTX_DISABLE,
+ ALL_WRITE);
/* no harm to write */
p_setb(perm, PCI_CACHE_LINE_SIZE, NO_VIRT, ALL_WRITE);
@@ -849,6 +849,15 @@ static void vfio_virt_basic(struct vfio_dev *vdev, int write,
vfio_write_config_byte(vdev, pos, newval);
}
break;
+ case PCI_COMMAND + 1:
+ if (write) {
+ if ((newval & PCI_CMD_INTX_DISABLE_BYTE) &&
+ !(val & PCI_CMD_INTX_DISABLE_BYTE))
+ vfio_disable_intx(vdev);
+ if (!(newval & PCI_CMD_INTX_DISABLE_BYTE) &&
+ (val & PCI_CMD_INTX_DISABLE_BYTE))
+ vfio_enable_intx(vdev);
+ }
case PCI_BASE_ADDRESS_0 ... PCI_BASE_ADDRESS_5 + 3:
case PCI_ROM_ADDRESS ... PCI_ROM_ADDRESS + 3:
if (write) {
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index c26f3b3..28da636 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -163,6 +163,10 @@ int vfio_irq_eoi(struct vfio_dev *);
int vfio_irq_eoi_eventfd(struct vfio_dev *, int);
int vfio_eoi_module_init(void);
void vfio_eoi_module_exit(void);
+irqreturn_t vfio_disable_intx(struct vfio_dev *vdev);
+void vfio_enable_intx(struct vfio_dev *vdev);
+
+#define PCI_CMD_INTX_DISABLE_BYTE (PCI_COMMAND_INTX_DISABLE >> 8)
#endif /* __KERNEL__ */
Alex - I am rejecting these 2 patches.
For patch 1/2, I started with yours and found a couple of problems, but then I
got into the spirit and did a buinch more cleaning up. My patch to follow.
For patch 2/2, the INTX stuff, I don't really see the problem. If the user
turns on the bit, it'll result in at most one more interrupt, right? If he
turns off the bit, then he doesn't want interrupts.
On Friday, November 05, 2010 10:30:15 am Alex Williamson wrote:
> Tom,
>
> Since we use INTX_DISABLE internally for PCI 2.3 devices, it's probably
> not a good idea to allow vfio users direct access to it. In trying to
> virtualize it, I stumbled on some config space virtualization issues.
> I think we want vconfig and the value written to hardware to be different
> when there are virtualized bits, but we lump them together. I think this
> is why I've never been able to rely on the memory enable bit of config
> space through vfio. With this first patch, all virtualized bits that
> are writable are sticky in vconfig. Non-writable bits come from either
> the old value or the physical hardware depending on whether it's
> virtualized. This means we can get rid of a lot of duplicated setting
> and reading of vconfig, and only add code for more complicated
> behaviors. This is tricky code, to please double check that I'm not
> doing something stupid. Thanks,
>
> Alex
>
> ---
>
> Alex Williamson (2):
> vfio: Virtualize PCI_COMMAND_INTX_DISABLE
> vfio: Fix config space virtualization
>
>
> drivers/vfio/vfio_intrs.c | 38 ++++++++----
> drivers/vfio/vfio_main.c | 14 ++++
> drivers/vfio/vfio_pci_config.c | 131
> ++++++++++++---------------------------- include/linux/vfio.h |
> 4 +
> 4 files changed, 84 insertions(+), 103 deletions(-)
On Tue, 2010-11-09 at 16:59 -0800, Tom Lyon wrote:
> Alex - I am rejecting these 2 patches.
>
> For patch 1/2, I started with yours and found a couple of problems, but then I
> got into the spirit and did a buinch more cleaning up. My patch to follow.
Great, I'll take a look.
> For patch 2/2, the INTX stuff, I don't really see the problem. If the user
> turns on the bit, it'll result in at most one more interrupt, right? If he
> turns off the bit, then he doesn't want interrupts.
The scenario I'm thinking of is that an interrupt comes in, VFIO sets
INTX_DISABLE, signals eventfd. We're already in a little bit of a weird
state for a VM because INTX_DISABLE just changed on it's own. The guest
interrupt handler blindly sets INTX_DISABLE again, and services the
interrupt. This has the side effect of sending the emulated APIC EOI,
which ends with VFIO clearing INTX_DISABLE, and now the guest is getting
interrupts it's not expecting.
Another aspect of it is that since the non-PCI-2.3/EOI patches, the VFIO
interrupt handler is wrapped around an irq_disabled check, where
irq_disabled only gets cleared by the EOI interfaces. So userspace
might clear INTX_DISABLE and expect new INTx eventfds, but it won't
happen without an EOI call. If we virtualize INTX_DISABLE, we can allow
userspace to use either the EOI interfaces or (in)directly manipulate
INTX_DISABLE from config space.
I could also virtualize the INTX_DISABLE bit in the qemu VFIO driver but
it gets a little bit tricky that I need to disable the EOI_EVENTFD to be
sure to catch all the EOIs in userspace. Emulating INTX_DISABLE for
non-PCI 2.3 devices is also a little more cumbersome from userspace, but
ultimately I'm not sure how valuable that is anyway. Overall, I figured
the above behavior issues were probably sufficient to implement it for
everyone in the VFIO driver.
Alex