QEMU running in a stubdom needs to be able to set INTX_DISABLE, and the
MSI(-X) enable flags in the PCI config space. This adds an attribute
'allow_interrupt_control' which when set for a PCI device allows writes
to this flag(s). The toolstack will need to set this for stubdoms.
When enabled, guest (stubdomain) will be allowed to set relevant enable
flags, but only one at a time - i.e. it refuses to enable more than one
of INTx, MSI, MSI-X at a time.
This functionality is needed only for config space access done by device
model (stubdomain) serving a HVM with the actual PCI device. It is not
necessary and unsafe to enable direct access to those bits for PV domain
with the device attached. For PV domains, there are separate protocol
messages (XEN_PCI_OP_{enable,disable}_{msi,msix}) for this purpose.
Those ops in addition to setting enable bits, also configure MSI(-X) in
dom0 kernel - which is undesirable for PCI passthrough to HVM guests.
This should not introduce any new security issues since a malicious
guest (or stubdom) can already generate MSIs through other ways, see
[1] page 8. Additionally, when qemu runs in dom0, it already have direct
access to those bits.
This is the second iteration of this feature. First was proposed as a
direct Xen interface through a new hypercall, but ultimately it was
rejected by the maintainer, because of mixing pciback and hypercalls for
PCI config space access isn't a good design. Full discussion at [2].
[1]: https://invisiblethingslab.com/resources/2011/Software%20Attacks%20on%20Intel%20VT-d.pdf
[2]: https://xen.markmail.org/thread/smpgpws4umdzizze
[part of the commit message and sysfs handling]
Signed-off-by: Simon Gaiser <[email protected]>
[the rest]
Signed-off-by: Marek Marczykowski-Górecki <[email protected]>
---
I'm not very happy about code duplication regarding MSI/MSI-X/INTx
exclusivity test, but I don't have better ideas how to structure it. Any
suggestions?
---
.../xen/xen-pciback/conf_space_capability.c | 113 ++++++++++++++++++
drivers/xen/xen-pciback/conf_space_header.c | 30 +++++
drivers/xen/xen-pciback/pci_stub.c | 66 ++++++++++
drivers/xen/xen-pciback/pciback.h | 1 +
4 files changed, 210 insertions(+)
diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c
index e5694133ebe5..c5a7c58ff3e3 100644
--- a/drivers/xen/xen-pciback/conf_space_capability.c
+++ b/drivers/xen/xen-pciback/conf_space_capability.c
@@ -189,6 +189,109 @@ static const struct config_field caplist_pm[] = {
{}
};
+static struct msi_msix_field_config {
+ u16 enable_bit; /* bit for enabling MSI/MSI-X */
+ int other_cap; /* the other capability for exclusiveness check */
+} msi_field_config = {
+ .enable_bit = PCI_MSI_FLAGS_ENABLE,
+ .other_cap = PCI_CAP_ID_MSIX,
+}, msix_field_config = {
+ .enable_bit = PCI_MSIX_FLAGS_ENABLE,
+ .other_cap = PCI_CAP_ID_MSI,
+};
+
+static void *msi_field_init(struct pci_dev *dev, int offset)
+{
+ return &msi_field_config;
+}
+
+static void *msix_field_init(struct pci_dev *dev, int offset)
+{
+ return &msix_field_config;
+}
+
+static int msi_msix_flags_write(struct pci_dev *dev, int offset, u16 new_value,
+ void *data)
+{
+ int err;
+ u16 old_value;
+ struct msi_msix_field_config *field_config = data;
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
+ int other_cap_offset;
+ u16 other_cap_enable_bit;
+ u16 other_cap_value;
+
+ if (xen_pcibk_permissive || dev_data->permissive)
+ goto write;
+
+ err = pci_read_config_word(dev, offset, &old_value);
+ if (err)
+ return err;
+
+ if (new_value == old_value)
+ return 0;
+
+ if (!dev_data->allow_interrupt_control ||
+ (new_value ^ old_value) & ~field_config->enable_bit)
+ return PCIBIOS_SET_FAILED;
+
+ if (new_value & field_config->enable_bit) {
+ /* don't allow enabling together with INTx */
+ err = pci_read_config_word(dev, PCI_COMMAND, &other_cap_value);
+ if (err)
+ return err;
+ if (!(other_cap_value & PCI_COMMAND_INTX_DISABLE))
+ return PCIBIOS_SET_FAILED;
+
+ /* and the other MSI(-X) */
+ switch (field_config->other_cap) {
+ case PCI_CAP_ID_MSI:
+ other_cap_offset = dev->msi_cap + PCI_MSI_FLAGS;
+ other_cap_enable_bit = PCI_MSI_FLAGS_ENABLE;
+ break;
+ case PCI_CAP_ID_MSIX:
+ other_cap_offset = dev->msix_cap + PCI_MSIX_FLAGS;
+ other_cap_enable_bit = PCI_MSIX_FLAGS_ENABLE;
+ break;
+ default:
+ BUG_ON(1);
+ }
+ err = pci_read_config_word(dev,
+ other_cap_offset,
+ &other_cap_value);
+ if (err)
+ return err;
+
+ if (other_cap_value & other_cap_enable_bit)
+ return PCIBIOS_SET_FAILED;
+ }
+
+write:
+ return pci_write_config_word(dev, offset, new_value);
+}
+
+static const struct config_field caplist_msix[] = {
+ {
+ .offset = PCI_MSIX_FLAGS,
+ .size = 2,
+ .init = msix_field_init,
+ .u.w.read = xen_pcibk_read_config_word,
+ .u.w.write = msi_msix_flags_write,
+ },
+ {}
+};
+
+static const struct config_field caplist_msi[] = {
+ {
+ .offset = PCI_MSI_FLAGS,
+ .size = 2,
+ .init = msi_field_init,
+ .u.w.read = xen_pcibk_read_config_word,
+ .u.w.write = msi_msix_flags_write,
+ },
+ {}
+};
+
static struct xen_pcibk_config_capability xen_pcibk_config_capability_pm = {
.capability = PCI_CAP_ID_PM,
.fields = caplist_pm,
@@ -197,11 +300,21 @@ static struct xen_pcibk_config_capability xen_pcibk_config_capability_vpd = {
.capability = PCI_CAP_ID_VPD,
.fields = caplist_vpd,
};
+static struct xen_pcibk_config_capability xen_pcibk_config_capability_msi = {
+ .capability = PCI_CAP_ID_MSI,
+ .fields = caplist_msi,
+};
+static struct xen_pcibk_config_capability xen_pcibk_config_capability_msix = {
+ .capability = PCI_CAP_ID_MSIX,
+ .fields = caplist_msix,
+};
int xen_pcibk_config_capability_init(void)
{
register_capability(&xen_pcibk_config_capability_vpd);
register_capability(&xen_pcibk_config_capability_pm);
+ register_capability(&xen_pcibk_config_capability_msi);
+ register_capability(&xen_pcibk_config_capability_msix);
return 0;
}
diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c
index 10ae24b5a76e..1e0fff02e21b 100644
--- a/drivers/xen/xen-pciback/conf_space_header.c
+++ b/drivers/xen/xen-pciback/conf_space_header.c
@@ -64,6 +64,7 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
int err;
u16 val;
struct pci_cmd_info *cmd = data;
+ u16 cap_value;
dev_data = pci_get_drvdata(dev);
if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
@@ -117,6 +118,35 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
pci_clear_mwi(dev);
}
+ if (dev_data && dev_data->allow_interrupt_control) {
+ if (!(cmd->val & PCI_COMMAND_INTX_DISABLE) &&
+ (value & PCI_COMMAND_INTX_DISABLE)) {
+ pci_intx(dev, 0);
+ } else if ((cmd->val & PCI_COMMAND_INTX_DISABLE) &&
+ !(value & PCI_COMMAND_INTX_DISABLE)) {
+ /* Do not allow enabling INTx together with MSI or MSI-X. */
+ /* Do not trust dev->msi(x)_enabled here, as enabling could be done
+ * bypassing the pci_*msi* functions, by the qemu.
+ */
+ err = pci_read_config_word(dev,
+ dev->msi_cap + PCI_MSI_FLAGS,
+ &cap_value);
+ if (!err && (cap_value & PCI_MSI_FLAGS_ENABLE))
+ err = -EBUSY;
+ if (!err)
+ err = pci_read_config_word(dev,
+ dev->msix_cap + PCI_MSIX_FLAGS,
+ &cap_value);
+ if (!err && (cap_value & PCI_MSIX_FLAGS_ENABLE))
+ err = -EBUSY;
+ if (err)
+ pr_warn("%s: cannot enable INTx (%d)\n",
+ pci_name(dev), err);
+ else
+ pci_intx(dev, 1);
+ }
+ }
+
cmd->val = value;
if (!xen_pcibk_permissive && (!dev_data || !dev_data->permissive))
diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
index 097410a7cdb7..7af93d65ed51 100644
--- a/drivers/xen/xen-pciback/pci_stub.c
+++ b/drivers/xen/xen-pciback/pci_stub.c
@@ -304,6 +304,8 @@ void pcistub_put_pci_dev(struct pci_dev *dev)
xen_pcibk_config_reset_dev(dev);
xen_pcibk_config_free_dyn_fields(dev);
+ dev_data->allow_interrupt_control = 0;
+
xen_unregister_device_domain_owner(dev);
spin_lock_irqsave(&found_psdev->lock, flags);
@@ -1431,6 +1433,65 @@ static ssize_t permissive_show(struct device_driver *drv, char *buf)
}
static DRIVER_ATTR_RW(permissive);
+static ssize_t allow_interrupt_control_store(struct device_driver *drv,
+ const char *buf, size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+ struct pcistub_device *psdev;
+ struct xen_pcibk_dev_data *dev_data;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+
+ psdev = pcistub_device_find(domain, bus, slot, func);
+ if (!psdev) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ dev_data = pci_get_drvdata(psdev->dev);
+ /* the driver data for a device should never be null at this point */
+ if (!dev_data) {
+ err = -ENXIO;
+ goto release;
+ }
+ dev_data->allow_interrupt_control = 1;
+release:
+ pcistub_device_put(psdev);
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+static ssize_t allow_interrupt_control_show(struct device_driver *drv,
+ char *buf)
+{
+ struct pcistub_device *psdev;
+ struct xen_pcibk_dev_data *dev_data;
+ size_t count = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (count >= PAGE_SIZE)
+ break;
+ if (!psdev->dev)
+ continue;
+ dev_data = pci_get_drvdata(psdev->dev);
+ if (!dev_data || !dev_data->allow_interrupt_control)
+ continue;
+ count +=
+ scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
+ pci_name(psdev->dev));
+ }
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return count;
+}
+static DRIVER_ATTR_RW(allow_interrupt_control);
+
static void pcistub_exit(void)
{
driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_new_slot);
@@ -1440,6 +1501,8 @@ static void pcistub_exit(void)
driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_quirks);
driver_remove_file(&xen_pcibk_pci_driver.driver,
&driver_attr_permissive);
+ driver_remove_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_allow_interrupt_control);
driver_remove_file(&xen_pcibk_pci_driver.driver,
&driver_attr_irq_handlers);
driver_remove_file(&xen_pcibk_pci_driver.driver,
@@ -1530,6 +1593,9 @@ static int __init pcistub_init(void)
if (!err)
err = driver_create_file(&xen_pcibk_pci_driver.driver,
&driver_attr_permissive);
+ if (!err)
+ err = driver_create_file(&xen_pcibk_pci_driver.driver,
+ &driver_attr_allow_interrupt_control);
if (!err)
err = driver_create_file(&xen_pcibk_pci_driver.driver,
diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h
index 263c059bff90..ce1077e32466 100644
--- a/drivers/xen/xen-pciback/pciback.h
+++ b/drivers/xen/xen-pciback/pciback.h
@@ -45,6 +45,7 @@ struct xen_pcibk_dev_data {
struct list_head config_fields;
struct pci_saved_state *pci_saved_state;
unsigned int permissive:1;
+ unsigned int allow_interrupt_control:1;
unsigned int warned_on_write:1;
unsigned int enable_intx:1;
unsigned int isr_on:1; /* Whether the IRQ handler is installed. */
--
2.21.0
On 03.12.2019 06:41, Marek Marczykowski-Górecki wrote:
> @@ -117,6 +118,35 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
> pci_clear_mwi(dev);
> }
>
> + if (dev_data && dev_data->allow_interrupt_control) {
> + if (!(cmd->val & PCI_COMMAND_INTX_DISABLE) &&
> + (value & PCI_COMMAND_INTX_DISABLE)) {
> + pci_intx(dev, 0);
> + } else if ((cmd->val & PCI_COMMAND_INTX_DISABLE) &&
> + !(value & PCI_COMMAND_INTX_DISABLE)) {
> + /* Do not allow enabling INTx together with MSI or MSI-X. */
> + /* Do not trust dev->msi(x)_enabled here, as enabling could be done
> + * bypassing the pci_*msi* functions, by the qemu.
> + */
> + err = pci_read_config_word(dev,
> + dev->msi_cap + PCI_MSI_FLAGS,
> + &cap_value);
> + if (!err && (cap_value & PCI_MSI_FLAGS_ENABLE))
> + err = -EBUSY;
> + if (!err)
> + err = pci_read_config_word(dev,
> + dev->msix_cap + PCI_MSIX_FLAGS,
> + &cap_value);
What about a device without MSI and/or MSI-X? Wouldn't you read
from (close to) config space offset 0 in this case, interpreting
some unrelated bit(s) as the MSI / MSI-X enable one(s)?
Just as an initial implementation related remark. I'm still to
think about the idea as a whole, albeit I find the argument
pretty convincing at the first glance of devices being able to
raise MSI anyway even when disabled as per the config space
setting. (Of course, as with any config space settings, devices
providing backdoor access would open similar avenues.)
Jan
On Tue, Dec 03, 2019 at 06:41:56AM +0100, Marek Marczykowski-G?recki wrote:
> QEMU running in a stubdom needs to be able to set INTX_DISABLE, and the
> MSI(-X) enable flags in the PCI config space. This adds an attribute
> 'allow_interrupt_control' which when set for a PCI device allows writes
> to this flag(s). The toolstack will need to set this for stubdoms.
> When enabled, guest (stubdomain) will be allowed to set relevant enable
> flags, but only one at a time - i.e. it refuses to enable more than one
> of INTx, MSI, MSI-X at a time.
>
> This functionality is needed only for config space access done by device
> model (stubdomain) serving a HVM with the actual PCI device. It is not
> necessary and unsafe to enable direct access to those bits for PV domain
> with the device attached. For PV domains, there are separate protocol
> messages (XEN_PCI_OP_{enable,disable}_{msi,msix}) for this purpose.
> Those ops in addition to setting enable bits, also configure MSI(-X) in
> dom0 kernel - which is undesirable for PCI passthrough to HVM guests.
>
> This should not introduce any new security issues since a malicious
> guest (or stubdom) can already generate MSIs through other ways, see
> [1] page 8. Additionally, when qemu runs in dom0, it already have direct
> access to those bits.
>
> This is the second iteration of this feature. First was proposed as a
> direct Xen interface through a new hypercall, but ultimately it was
> rejected by the maintainer, because of mixing pciback and hypercalls for
> PCI config space access isn't a good design. Full discussion at [2].
>
> [1]: https://invisiblethingslab.com/resources/2011/Software%20Attacks%20on%20Intel%20VT-d.pdf
> [2]: https://xen.markmail.org/thread/smpgpws4umdzizze
>
> [part of the commit message and sysfs handling]
> Signed-off-by: Simon Gaiser <[email protected]>
> [the rest]
> Signed-off-by: Marek Marczykowski-G?recki <[email protected]>
> ---
> I'm not very happy about code duplication regarding MSI/MSI-X/INTx
> exclusivity test, but I don't have better ideas how to structure it. Any
> suggestions?
Can't you create a helper that returns the currently enabled interrupt
mode?
I expect returning an enum (ie: NONE, INTX, MSI, MSIX) should be fine
since no two of those should be enabled at the same time.
> ---
> .../xen/xen-pciback/conf_space_capability.c | 113 ++++++++++++++++++
> drivers/xen/xen-pciback/conf_space_header.c | 30 +++++
> drivers/xen/xen-pciback/pci_stub.c | 66 ++++++++++
> drivers/xen/xen-pciback/pciback.h | 1 +
> 4 files changed, 210 insertions(+)
>
> diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c
> index e5694133ebe5..c5a7c58ff3e3 100644
> --- a/drivers/xen/xen-pciback/conf_space_capability.c
> +++ b/drivers/xen/xen-pciback/conf_space_capability.c
> @@ -189,6 +189,109 @@ static const struct config_field caplist_pm[] = {
> {}
> };
>
> +static struct msi_msix_field_config {
> + u16 enable_bit; /* bit for enabling MSI/MSI-X */
> + int other_cap; /* the other capability for exclusiveness check */
Nit: just one space between the declaration and the comment IMO.
Also capability ID is not a signed value, hence unsigned int would
feel more natural.
> +} msi_field_config = {
> + .enable_bit = PCI_MSI_FLAGS_ENABLE,
> + .other_cap = PCI_CAP_ID_MSIX,
> +}, msix_field_config = {
> + .enable_bit = PCI_MSIX_FLAGS_ENABLE,
> + .other_cap = PCI_CAP_ID_MSI,
> +};
I think it would be more helpful to store the current capability ID
rather the one you need to check against. Then if you had a helper
that returns the currently enabled interrupt mode you would have to
check that either it's NONE or matches the capability requested to be
enabled.
> +
> +static void *msi_field_init(struct pci_dev *dev, int offset)
> +{
> + return &msi_field_config;
> +}
> +
> +static void *msix_field_init(struct pci_dev *dev, int offset)
> +{
> + return &msix_field_config;
> +}
> +
> +static int msi_msix_flags_write(struct pci_dev *dev, int offset, u16 new_value,
> + void *data)
> +{
> + int err;
> + u16 old_value;
> + struct msi_msix_field_config *field_config = data;
> + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
const for both the above.
> + int other_cap_offset;
unsigned int
> + u16 other_cap_enable_bit;
> + u16 other_cap_value;
> +
> + if (xen_pcibk_permissive || dev_data->permissive)
> + goto write;
> +
> + err = pci_read_config_word(dev, offset, &old_value);
> + if (err)
> + return err;
> +
> + if (new_value == old_value)
> + return 0;
> +
> + if (!dev_data->allow_interrupt_control ||
> + (new_value ^ old_value) & ~field_config->enable_bit)
> + return PCIBIOS_SET_FAILED;
> +
> + if (new_value & field_config->enable_bit) {
> + /* don't allow enabling together with INTx */
> + err = pci_read_config_word(dev, PCI_COMMAND, &other_cap_value);
> + if (err)
> + return err;
> + if (!(other_cap_value & PCI_COMMAND_INTX_DISABLE))
> + return PCIBIOS_SET_FAILED;
> +
> + /* and the other MSI(-X) */
> + switch (field_config->other_cap) {
> + case PCI_CAP_ID_MSI:
> + other_cap_offset = dev->msi_cap + PCI_MSI_FLAGS;
> + other_cap_enable_bit = PCI_MSI_FLAGS_ENABLE;
> + break;
> + case PCI_CAP_ID_MSIX:
> + other_cap_offset = dev->msix_cap + PCI_MSIX_FLAGS;
> + other_cap_enable_bit = PCI_MSIX_FLAGS_ENABLE;
> + break;
I think you should check whether the other capability exists. I guess
msi{x}_cap will be 0 if not present?
> + default:
> + BUG_ON(1);
Doesn't Linux have a plain BUG();?
> + }
> + err = pci_read_config_word(dev,
> + other_cap_offset,
> + &other_cap_value);
> + if (err)
> + return err;
> +
> + if (other_cap_value & other_cap_enable_bit)
> + return PCIBIOS_SET_FAILED;
> + }
> +
> +write:
> + return pci_write_config_word(dev, offset, new_value);
> +}
> +
> +static const struct config_field caplist_msix[] = {
> + {
> + .offset = PCI_MSIX_FLAGS,
> + .size = 2,
> + .init = msix_field_init,
> + .u.w.read = xen_pcibk_read_config_word,
> + .u.w.write = msi_msix_flags_write,
> + },
> + {}
> +};
> +
> +static const struct config_field caplist_msi[] = {
> + {
> + .offset = PCI_MSI_FLAGS,
> + .size = 2,
> + .init = msi_field_init,
> + .u.w.read = xen_pcibk_read_config_word,
> + .u.w.write = msi_msix_flags_write,
> + },
> + {}
> +};
> +
> static struct xen_pcibk_config_capability xen_pcibk_config_capability_pm = {
> .capability = PCI_CAP_ID_PM,
> .fields = caplist_pm,
> @@ -197,11 +300,21 @@ static struct xen_pcibk_config_capability xen_pcibk_config_capability_vpd = {
> .capability = PCI_CAP_ID_VPD,
> .fields = caplist_vpd,
> };
> +static struct xen_pcibk_config_capability xen_pcibk_config_capability_msi = {
> + .capability = PCI_CAP_ID_MSI,
> + .fields = caplist_msi,
> +};
> +static struct xen_pcibk_config_capability xen_pcibk_config_capability_msix = {
> + .capability = PCI_CAP_ID_MSIX,
> + .fields = caplist_msix,
> +};
>
> int xen_pcibk_config_capability_init(void)
> {
> register_capability(&xen_pcibk_config_capability_vpd);
> register_capability(&xen_pcibk_config_capability_pm);
> + register_capability(&xen_pcibk_config_capability_msi);
> + register_capability(&xen_pcibk_config_capability_msix);
>
> return 0;
> }
> diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c
> index 10ae24b5a76e..1e0fff02e21b 100644
> --- a/drivers/xen/xen-pciback/conf_space_header.c
> +++ b/drivers/xen/xen-pciback/conf_space_header.c
> @@ -64,6 +64,7 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
> int err;
> u16 val;
> struct pci_cmd_info *cmd = data;
> + u16 cap_value;
>
> dev_data = pci_get_drvdata(dev);
> if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
> @@ -117,6 +118,35 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
> pci_clear_mwi(dev);
> }
>
> + if (dev_data && dev_data->allow_interrupt_control) {
> + if (!(cmd->val & PCI_COMMAND_INTX_DISABLE) &&
> + (value & PCI_COMMAND_INTX_DISABLE)) {
> + pci_intx(dev, 0);
> + } else if ((cmd->val & PCI_COMMAND_INTX_DISABLE) &&
> + !(value & PCI_COMMAND_INTX_DISABLE)) {
> + /* Do not allow enabling INTx together with MSI or MSI-X. */
> + /* Do not trust dev->msi(x)_enabled here, as enabling could be done
> + * bypassing the pci_*msi* functions, by the qemu.
> + */
> + err = pci_read_config_word(dev,
> + dev->msi_cap + PCI_MSI_FLAGS,
> + &cap_value);
> + if (!err && (cap_value & PCI_MSI_FLAGS_ENABLE))
> + err = -EBUSY;
> + if (!err)
> + err = pci_read_config_word(dev,
> + dev->msix_cap + PCI_MSIX_FLAGS,
> + &cap_value);
> + if (!err && (cap_value & PCI_MSIX_FLAGS_ENABLE))
> + err = -EBUSY;
Shouldn't this return PCI-style errors?
I think PCIBIOS_SET_FAILED would be more appropriate here. Note sure
whether you should terminate the function here in that case, or else
the error is lost and not returned to the caller, yet the function
failed at least partially.
> + if (err)
> + pr_warn("%s: cannot enable INTx (%d)\n",
This should be ratelimited since it's guest triggerable.
> + pci_name(dev), err);
> + else
> + pci_intx(dev, 1);
> + }
> + }
> +
> cmd->val = value;
>
> if (!xen_pcibk_permissive && (!dev_data || !dev_data->permissive))
> diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
> index 097410a7cdb7..7af93d65ed51 100644
> --- a/drivers/xen/xen-pciback/pci_stub.c
> +++ b/drivers/xen/xen-pciback/pci_stub.c
> @@ -304,6 +304,8 @@ void pcistub_put_pci_dev(struct pci_dev *dev)
> xen_pcibk_config_reset_dev(dev);
> xen_pcibk_config_free_dyn_fields(dev);
>
> + dev_data->allow_interrupt_control = 0;
Why do you need to do this here? I don't see any other options being
cleared here (I would expect for example permissive to also be
cleared if required).
> +
> xen_unregister_device_domain_owner(dev);
>
> spin_lock_irqsave(&found_psdev->lock, flags);
> @@ -1431,6 +1433,65 @@ static ssize_t permissive_show(struct device_driver *drv, char *buf)
> }
> static DRIVER_ATTR_RW(permissive);
>
> +static ssize_t allow_interrupt_control_store(struct device_driver *drv,
> + const char *buf, size_t count)
> +{
> + int domain, bus, slot, func;
> + int err;
> + struct pcistub_device *psdev;
> + struct xen_pcibk_dev_data *dev_data;
> +
> + err = str_to_slot(buf, &domain, &bus, &slot, &func);
> + if (err)
> + goto out;
> +
> + psdev = pcistub_device_find(domain, bus, slot, func);
> + if (!psdev) {
> + err = -ENODEV;
> + goto out;
> + }
> +
> + dev_data = pci_get_drvdata(psdev->dev);
> + /* the driver data for a device should never be null at this point */
> + if (!dev_data) {
> + err = -ENXIO;
> + goto release;
> + }
> + dev_data->allow_interrupt_control = 1;
> +release:
> + pcistub_device_put(psdev);
> +out:
> + if (!err)
> + err = count;
> + return err;
> +}
> +
> +static ssize_t allow_interrupt_control_show(struct device_driver *drv,
> + char *buf)
> +{
> + struct pcistub_device *psdev;
> + struct xen_pcibk_dev_data *dev_data;
> + size_t count = 0;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&pcistub_devices_lock, flags);
> + list_for_each_entry(psdev, &pcistub_devices, dev_list) {
> + if (count >= PAGE_SIZE)
> + break;
> + if (!psdev->dev)
> + continue;
> + dev_data = pci_get_drvdata(psdev->dev);
> + if (!dev_data || !dev_data->allow_interrupt_control)
> + continue;
> + count +=
> + scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
> + pci_name(psdev->dev));
> + }
> + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
> + return count;
> +}
> +static DRIVER_ATTR_RW(allow_interrupt_control);
This is mostly a clone of permissive_{store/show}, I wonder if those
functions could be generalized since it's just repeated boilerplate
code in order to fetch allow_interrupt_control or permissive. Anyway,
likely not part of this patch.
Thanks, Roger.
On 03.12.2019 06:41, Marek Marczykowski-Górecki wrote:
> QEMU running in a stubdom needs to be able to set INTX_DISABLE, and the
> MSI(-X) enable flags in the PCI config space. This adds an attribute
> 'allow_interrupt_control' which when set for a PCI device allows writes
> to this flag(s). The toolstack will need to set this for stubdoms.
> When enabled, guest (stubdomain) will be allowed to set relevant enable
> flags, but only one at a time - i.e. it refuses to enable more than one
> of INTx, MSI, MSI-X at a time.
>
> This functionality is needed only for config space access done by device
> model (stubdomain) serving a HVM with the actual PCI device. It is not
> necessary and unsafe to enable direct access to those bits for PV domain
> with the device attached. For PV domains, there are separate protocol
> messages (XEN_PCI_OP_{enable,disable}_{msi,msix}) for this purpose.
> Those ops in addition to setting enable bits, also configure MSI(-X) in
> dom0 kernel - which is undesirable for PCI passthrough to HVM guests.
>
> This should not introduce any new security issues since a malicious
> guest (or stubdom) can already generate MSIs through other ways, see
> [1] page 8.
True, albeit this doesn't cover INTX_DISABLE.
> Additionally, when qemu runs in dom0, it already have direct
> access to those bits.
True again, but any bug here (as in: too wide exposure) is a qemu
bug (possibly security issue). The statement also isn't really
correct for de-privileged qemu, I think (but I also think PCI
pass-through doesn't work in that mode at all yet).
On the whole this looks to be an acceptable approach to me. But
I'm not the maintainer, so my opinion may not count much. Some
issues with the implementation itself were already pointed out.
Jan
On Tue, Dec 03, 2019 at 04:17:33PM +0100, Roger Pau Monné wrote:
> On Tue, Dec 03, 2019 at 06:41:56AM +0100, Marek Marczykowski-Górecki wrote:
> > QEMU running in a stubdom needs to be able to set INTX_DISABLE, and the
> > MSI(-X) enable flags in the PCI config space. This adds an attribute
> > 'allow_interrupt_control' which when set for a PCI device allows writes
> > to this flag(s). The toolstack will need to set this for stubdoms.
> > When enabled, guest (stubdomain) will be allowed to set relevant enable
> > flags, but only one at a time - i.e. it refuses to enable more than one
> > of INTx, MSI, MSI-X at a time.
> >
> > This functionality is needed only for config space access done by device
> > model (stubdomain) serving a HVM with the actual PCI device. It is not
> > necessary and unsafe to enable direct access to those bits for PV domain
> > with the device attached. For PV domains, there are separate protocol
> > messages (XEN_PCI_OP_{enable,disable}_{msi,msix}) for this purpose.
> > Those ops in addition to setting enable bits, also configure MSI(-X) in
> > dom0 kernel - which is undesirable for PCI passthrough to HVM guests.
> >
> > This should not introduce any new security issues since a malicious
> > guest (or stubdom) can already generate MSIs through other ways, see
> > [1] page 8. Additionally, when qemu runs in dom0, it already have direct
> > access to those bits.
> >
> > This is the second iteration of this feature. First was proposed as a
> > direct Xen interface through a new hypercall, but ultimately it was
> > rejected by the maintainer, because of mixing pciback and hypercalls for
> > PCI config space access isn't a good design. Full discussion at [2].
> >
> > [1]: https://invisiblethingslab.com/resources/2011/Software%20Attacks%20on%20Intel%20VT-d.pdf
> > [2]: https://xen.markmail.org/thread/smpgpws4umdzizze
> >
> > [part of the commit message and sysfs handling]
> > Signed-off-by: Simon Gaiser <[email protected]>
> > [the rest]
> > Signed-off-by: Marek Marczykowski-Górecki <[email protected]>
> > ---
> > I'm not very happy about code duplication regarding MSI/MSI-X/INTx
> > exclusivity test, but I don't have better ideas how to structure it. Any
> > suggestions?
>
> Can't you create a helper that returns the currently enabled interrupt
> mode?
>
> I expect returning an enum (ie: NONE, INTX, MSI, MSIX) should be fine
> since no two of those should be enabled at the same time.
Done in v2 (plus ERR member).
>
> > ---
> > .../xen/xen-pciback/conf_space_capability.c | 113 ++++++++++++++++++
> > drivers/xen/xen-pciback/conf_space_header.c | 30 +++++
> > drivers/xen/xen-pciback/pci_stub.c | 66 ++++++++++
> > drivers/xen/xen-pciback/pciback.h | 1 +
> > 4 files changed, 210 insertions(+)
> >
> > diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c
> > index e5694133ebe5..c5a7c58ff3e3 100644
> > --- a/drivers/xen/xen-pciback/conf_space_capability.c
> > +++ b/drivers/xen/xen-pciback/conf_space_capability.c
> > @@ -189,6 +189,109 @@ static const struct config_field caplist_pm[] = {
> > {}
> > };
> >
> > +static struct msi_msix_field_config {
> > + u16 enable_bit; /* bit for enabling MSI/MSI-X */
> > + int other_cap; /* the other capability for exclusiveness check */
>
> Nit: just one space between the declaration and the comment IMO.
>
> Also capability ID is not a signed value, hence unsigned int would
> feel more natural.
Replaced with enum in v2.
> > +} msi_field_config = {
> > + .enable_bit = PCI_MSI_FLAGS_ENABLE,
> > + .other_cap = PCI_CAP_ID_MSIX,
> > +}, msix_field_config = {
> > + .enable_bit = PCI_MSIX_FLAGS_ENABLE,
> > + .other_cap = PCI_CAP_ID_MSI,
> > +};
>
> I think it would be more helpful to store the current capability ID
> rather the one you need to check against. Then if you had a helper
> that returns the currently enabled interrupt mode you would have to
> check that either it's NONE or matches the capability requested to be
> enabled.
Done in v2.
> > +
> > +static void *msi_field_init(struct pci_dev *dev, int offset)
> > +{
> > + return &msi_field_config;
> > +}
> > +
> > +static void *msix_field_init(struct pci_dev *dev, int offset)
> > +{
> > + return &msix_field_config;
> > +}
> > +
> > +static int msi_msix_flags_write(struct pci_dev *dev, int offset, u16 new_value,
> > + void *data)
> > +{
> > + int err;
> > + u16 old_value;
> > + struct msi_msix_field_config *field_config = data;
> > + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
>
> const for both the above.
Done in v2.
> > + int other_cap_offset;
>
> unsigned int
Done in v2.
> > + u16 other_cap_enable_bit;
> > + u16 other_cap_value;
> > +
> > + if (xen_pcibk_permissive || dev_data->permissive)
> > + goto write;
> > +
> > + err = pci_read_config_word(dev, offset, &old_value);
> > + if (err)
> > + return err;
> > +
> > + if (new_value == old_value)
> > + return 0;
> > +
> > + if (!dev_data->allow_interrupt_control ||
> > + (new_value ^ old_value) & ~field_config->enable_bit)
> > + return PCIBIOS_SET_FAILED;
> > +
> > + if (new_value & field_config->enable_bit) {
> > + /* don't allow enabling together with INTx */
> > + err = pci_read_config_word(dev, PCI_COMMAND, &other_cap_value);
> > + if (err)
> > + return err;
> > + if (!(other_cap_value & PCI_COMMAND_INTX_DISABLE))
> > + return PCIBIOS_SET_FAILED;
> > +
> > + /* and the other MSI(-X) */
> > + switch (field_config->other_cap) {
> > + case PCI_CAP_ID_MSI:
> > + other_cap_offset = dev->msi_cap + PCI_MSI_FLAGS;
> > + other_cap_enable_bit = PCI_MSI_FLAGS_ENABLE;
> > + break;
> > + case PCI_CAP_ID_MSIX:
> > + other_cap_offset = dev->msix_cap + PCI_MSIX_FLAGS;
> > + other_cap_enable_bit = PCI_MSIX_FLAGS_ENABLE;
> > + break;
>
> I think you should check whether the other capability exists. I guess
> msi{x}_cap will be 0 if not present?
Done in v2 (and the code is moved into separate function).
> > + default:
> > + BUG_ON(1);
>
> Doesn't Linux have a plain BUG();?
Not needed anymore.
> > + }
> > + err = pci_read_config_word(dev,
> > + other_cap_offset,
> > + &other_cap_value);
> > + if (err)
> > + return err;
> > +
> > + if (other_cap_value & other_cap_enable_bit)
> > + return PCIBIOS_SET_FAILED;
> > + }
> > +
> > +write:
> > + return pci_write_config_word(dev, offset, new_value);
> > +}
> > +
> > +static const struct config_field caplist_msix[] = {
> > + {
> > + .offset = PCI_MSIX_FLAGS,
> > + .size = 2,
> > + .init = msix_field_init,
> > + .u.w.read = xen_pcibk_read_config_word,
> > + .u.w.write = msi_msix_flags_write,
> > + },
> > + {}
> > +};
> > +
> > +static const struct config_field caplist_msi[] = {
> > + {
> > + .offset = PCI_MSI_FLAGS,
> > + .size = 2,
> > + .init = msi_field_init,
> > + .u.w.read = xen_pcibk_read_config_word,
> > + .u.w.write = msi_msix_flags_write,
> > + },
> > + {}
> > +};
> > +
> > static struct xen_pcibk_config_capability xen_pcibk_config_capability_pm = {
> > .capability = PCI_CAP_ID_PM,
> > .fields = caplist_pm,
> > @@ -197,11 +300,21 @@ static struct xen_pcibk_config_capability xen_pcibk_config_capability_vpd = {
> > .capability = PCI_CAP_ID_VPD,
> > .fields = caplist_vpd,
> > };
> > +static struct xen_pcibk_config_capability xen_pcibk_config_capability_msi = {
> > + .capability = PCI_CAP_ID_MSI,
> > + .fields = caplist_msi,
> > +};
> > +static struct xen_pcibk_config_capability xen_pcibk_config_capability_msix = {
> > + .capability = PCI_CAP_ID_MSIX,
> > + .fields = caplist_msix,
> > +};
> >
> > int xen_pcibk_config_capability_init(void)
> > {
> > register_capability(&xen_pcibk_config_capability_vpd);
> > register_capability(&xen_pcibk_config_capability_pm);
> > + register_capability(&xen_pcibk_config_capability_msi);
> > + register_capability(&xen_pcibk_config_capability_msix);
> >
> > return 0;
> > }
> > diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c
> > index 10ae24b5a76e..1e0fff02e21b 100644
> > --- a/drivers/xen/xen-pciback/conf_space_header.c
> > +++ b/drivers/xen/xen-pciback/conf_space_header.c
> > @@ -64,6 +64,7 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
> > int err;
> > u16 val;
> > struct pci_cmd_info *cmd = data;
> > + u16 cap_value;
> >
> > dev_data = pci_get_drvdata(dev);
> > if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
> > @@ -117,6 +118,35 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
> > pci_clear_mwi(dev);
> > }
> >
> > + if (dev_data && dev_data->allow_interrupt_control) {
> > + if (!(cmd->val & PCI_COMMAND_INTX_DISABLE) &&
> > + (value & PCI_COMMAND_INTX_DISABLE)) {
> > + pci_intx(dev, 0);
> > + } else if ((cmd->val & PCI_COMMAND_INTX_DISABLE) &&
> > + !(value & PCI_COMMAND_INTX_DISABLE)) {
> > + /* Do not allow enabling INTx together with MSI or MSI-X. */
> > + /* Do not trust dev->msi(x)_enabled here, as enabling could be done
> > + * bypassing the pci_*msi* functions, by the qemu.
> > + */
> > + err = pci_read_config_word(dev,
> > + dev->msi_cap + PCI_MSI_FLAGS,
> > + &cap_value);
> > + if (!err && (cap_value & PCI_MSI_FLAGS_ENABLE))
> > + err = -EBUSY;
> > + if (!err)
> > + err = pci_read_config_word(dev,
> > + dev->msix_cap + PCI_MSIX_FLAGS,
> > + &cap_value);
> > + if (!err && (cap_value & PCI_MSIX_FLAGS_ENABLE))
> > + err = -EBUSY;
>
> Shouldn't this return PCI-style errors?
>
> I think PCIBIOS_SET_FAILED would be more appropriate here. Note sure
> whether you should terminate the function here in that case, or else
> the error is lost and not returned to the caller, yet the function
> failed at least partially.
Fixed in v2.
> > + if (err)
> > + pr_warn("%s: cannot enable INTx (%d)\n",
>
> This should be ratelimited since it's guest triggerable.
Message removed in favor of not loosing error value.
> > + pci_name(dev), err);
> > + else
> > + pci_intx(dev, 1);
> > + }
> > + }
> > +
> > cmd->val = value;
> >
> > if (!xen_pcibk_permissive && (!dev_data || !dev_data->permissive))
> > diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
> > index 097410a7cdb7..7af93d65ed51 100644
> > --- a/drivers/xen/xen-pciback/pci_stub.c
> > +++ b/drivers/xen/xen-pciback/pci_stub.c
> > @@ -304,6 +304,8 @@ void pcistub_put_pci_dev(struct pci_dev *dev)
> > xen_pcibk_config_reset_dev(dev);
> > xen_pcibk_config_free_dyn_fields(dev);
> >
> > + dev_data->allow_interrupt_control = 0;
>
> Why do you need to do this here? I don't see any other options being
> cleared here (I would expect for example permissive to also be
> cleared if required).
To ensure the option isn't erroneously left enabled after switching the
device from HVM domain (with stubdom) to PV one. This may happen when toolstack only
do write when enabling this option (HVM with stubdom case), but do
not disable it in other cases.
In fact exactly the same issue applies to permissive setting - if you
first boot a guest with:
pci = [ '...,permisive=1' ]
and then with:
pci = [ '...,permisive=0' ]
the permissive mode will stay enabled. Unless you unbind the device from
the driver in the meantime. The easiest method would be xl
pci-assignable-remove && xl pci-assignable-add, but since XSA-302 that
would re-introduce the very same security issue.
> > +
> > xen_unregister_device_domain_owner(dev);
> >
> > spin_lock_irqsave(&found_psdev->lock, flags);
> > @@ -1431,6 +1433,65 @@ static ssize_t permissive_show(struct device_driver *drv, char *buf)
> > }
> > static DRIVER_ATTR_RW(permissive);
> >
> > +static ssize_t allow_interrupt_control_store(struct device_driver *drv,
> > + const char *buf, size_t count)
> > +{
> > + int domain, bus, slot, func;
> > + int err;
> > + struct pcistub_device *psdev;
> > + struct xen_pcibk_dev_data *dev_data;
> > +
> > + err = str_to_slot(buf, &domain, &bus, &slot, &func);
> > + if (err)
> > + goto out;
> > +
> > + psdev = pcistub_device_find(domain, bus, slot, func);
> > + if (!psdev) {
> > + err = -ENODEV;
> > + goto out;
> > + }
> > +
> > + dev_data = pci_get_drvdata(psdev->dev);
> > + /* the driver data for a device should never be null at this point */
> > + if (!dev_data) {
> > + err = -ENXIO;
> > + goto release;
> > + }
> > + dev_data->allow_interrupt_control = 1;
> > +release:
> > + pcistub_device_put(psdev);
> > +out:
> > + if (!err)
> > + err = count;
> > + return err;
> > +}
> > +
> > +static ssize_t allow_interrupt_control_show(struct device_driver *drv,
> > + char *buf)
> > +{
> > + struct pcistub_device *psdev;
> > + struct xen_pcibk_dev_data *dev_data;
> > + size_t count = 0;
> > + unsigned long flags;
> > +
> > + spin_lock_irqsave(&pcistub_devices_lock, flags);
> > + list_for_each_entry(psdev, &pcistub_devices, dev_list) {
> > + if (count >= PAGE_SIZE)
> > + break;
> > + if (!psdev->dev)
> > + continue;
> > + dev_data = pci_get_drvdata(psdev->dev);
> > + if (!dev_data || !dev_data->allow_interrupt_control)
> > + continue;
> > + count +=
> > + scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
> > + pci_name(psdev->dev));
> > + }
> > + spin_unlock_irqrestore(&pcistub_devices_lock, flags);
> > + return count;
> > +}
> > +static DRIVER_ATTR_RW(allow_interrupt_control);
>
> This is mostly a clone of permissive_{store/show}, I wonder if those
> functions could be generalized since it's just repeated boilerplate
> code in order to fetch allow_interrupt_control or permissive. Anyway,
> likely not part of this patch.
That would be more significant change than just refactoring those
functions. permissive and allow_interrupt_control are separate fields in
a structure, so to generalize it I can't simply put it into a function
parameter. It would need to be a macro, or use some callbacks. I don't
like either. I may be missing something, but the best I can think of is
to convert it into some flags field with separate bits for those values.
But this means also changing every place using those values.
--
Best Regards,
Marek Marczykowski-Górecki
Invisible Things Lab
A: Because it messes up the order in which people normally read text.
Q: Why is top-posting such a bad thing?