v2 -> v3:
Saved structure has variable contents.
Avi, see if this adds any credibility to the pci-core allocated
opaque buffer. It was wrong in the previous versions to distill
the variable device capability save list into a fixed struct.
This should also eliminate any future maintenance specific to
this storing and loading of state as capability save changes.
v1 -> v2:
Make the pointer passed around less opaque for type safety.
Bug https://bugs.launchpad.net/qemu/+bug/754591 is caused because
the KVM module attempts to do a pci_save_state() before assigning
the device to a VM, expecting that the saved state will remain
valid until we release the device. This is in conflict with our
need to reset devices using PCI sysfs during a VM reset to
quiesce the device. Any calls to pci_reset_function() will
overwrite the device saved state prior to reset, and reload and
invalidate the state after. KVM then ends up trying to restore
the state, but it's already invalid, so the device ends up with
reset values.
This series adds a mechanism to pull the saved state off the
struct pci_dev and reload it later. Thanks,
Alex
---
Alex Williamson (3):
KVM: Use pci_store/load_saved_state() around VM device usage
PCI: Add interfaces to store and load the device saved state
PCI: Track the size of each saved capability data area
drivers/pci/pci.c | 110 ++++++++++++++++++++++++++++++++++++++++++++--
include/linux/kvm_host.h | 1
include/linux/pci.h | 15 +++++-
virt/kvm/assigned-dev.c | 18 ++++++--
4 files changed, 132 insertions(+), 12 deletions(-)
This will allow us to store and load it later.
Signed-off-by: Alex Williamson <[email protected]>
---
drivers/pci/pci.c | 12 +++++++-----
include/linux/pci.h | 11 ++++++++---
2 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 2472e71..d2500a0 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -830,7 +830,7 @@ static int pci_save_pcie_state(struct pci_dev *dev)
dev_err(&dev->dev, "buffer not found in %s\n", __func__);
return -ENOMEM;
}
- cap = (u16 *)&save_state->data[0];
+ cap = (u16 *)&save_state->saved.data[0];
pci_read_config_word(dev, pos + PCI_EXP_FLAGS, &flags);
@@ -863,7 +863,7 @@ static void pci_restore_pcie_state(struct pci_dev *dev)
pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
if (!save_state || pos <= 0)
return;
- cap = (u16 *)&save_state->data[0];
+ cap = (u16 *)&save_state->saved.data[0];
pci_read_config_word(dev, pos + PCI_EXP_FLAGS, &flags);
@@ -899,7 +899,8 @@ static int pci_save_pcix_state(struct pci_dev *dev)
return -ENOMEM;
}
- pci_read_config_word(dev, pos + PCI_X_CMD, (u16 *)save_state->data);
+ pci_read_config_word(dev, pos + PCI_X_CMD,
+ (u16 *)save_state->saved.data);
return 0;
}
@@ -914,7 +915,7 @@ static void pci_restore_pcix_state(struct pci_dev *dev)
pos = pci_find_capability(dev, PCI_CAP_ID_PCIX);
if (!save_state || pos <= 0)
return;
- cap = (u16 *)&save_state->data[0];
+ cap = (u16 *)&save_state->saved.data[0];
pci_write_config_word(dev, pos + PCI_X_CMD, cap[i++]);
}
@@ -1771,7 +1772,8 @@ static int pci_add_cap_save_buffer(
if (!save_state)
return -ENOMEM;
- save_state->cap_nr = cap;
+ save_state->saved.cap_nr = cap;
+ save_state->saved.size = size;
pci_add_saved_cap(dev, save_state);
return 0;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 96f70d7..46fd382 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -214,12 +214,17 @@ enum pci_bus_speed {
PCI_SPEED_UNKNOWN = 0xff,
};
-struct pci_cap_saved_state {
- struct hlist_node next;
+struct pci_cap_saved {
char cap_nr;
+ unsigned int size;
u32 data[0];
};
+struct pci_cap_saved_state {
+ struct hlist_node next;
+ struct pci_cap_saved saved;
+};
+
struct pcie_link_state;
struct pci_vpd;
struct pci_sriov;
@@ -366,7 +371,7 @@ static inline struct pci_cap_saved_state *pci_find_saved_cap(
struct hlist_node *pos;
hlist_for_each_entry(tmp, pos, &pci_dev->saved_cap_space, next) {
- if (tmp->cap_nr == cap)
+ if (tmp->saved.cap_nr == cap)
return tmp;
}
return NULL;
For KVM device assignment, we'd like to save off the state of a device
prior to passing it to the guest and restore it later. We also want
to allow pci_reset_function() to be called while the device is owned
by the guest. This however overwrites and invalidates the struct pci_dev
buffers, so we can't just manually call save and restore. Add generic
interfaces for the saved state to be stored and reloaded back into
struct pci_dev at a later time.
Signed-off-by: Alex Williamson <[email protected]>
---
drivers/pci/pci.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/pci.h | 4 ++
2 files changed, 102 insertions(+), 0 deletions(-)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index d2500a0..7631acf 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -976,6 +976,104 @@ void pci_restore_state(struct pci_dev *dev)
dev->state_saved = false;
}
+struct pci_saved_state {
+ u32 config_space[16];
+ struct pci_cap_saved cap_saved[0];
+};
+
+/**
+ * pci_store_saved_state - Allocate and return an opaque struct containing
+ * the device saved state.
+ * @dev: PCI device that we're dealing with
+ *
+ * Return NULL if no state or error.
+ */
+struct pci_saved_state *pci_store_saved_state(struct pci_dev *dev)
+{
+ struct pci_saved_state *state;
+ struct pci_cap_saved_state *tmp;
+ struct pci_cap_saved *cap_saved;
+ struct hlist_node *pos;
+ size_t size;
+
+ if (!dev->state_saved)
+ return NULL;
+
+ size = sizeof(*state) + sizeof(struct pci_cap_saved);
+
+ hlist_for_each_entry(tmp, pos, &dev->saved_cap_space, next)
+ size += sizeof(struct pci_cap_saved) + tmp->saved.size;
+
+ state = kzalloc(size, GFP_KERNEL);
+ if (!state)
+ return NULL;
+
+ memcpy(state->config_space, dev->saved_config_space,
+ sizeof(state->config_space));
+
+ cap_saved = state->cap_saved;
+ hlist_for_each_entry(tmp, pos, &dev->saved_cap_space, next) {
+ size_t len = sizeof(struct pci_cap_saved) + tmp->saved.size;
+ memcpy(cap_saved, &tmp->saved, len);
+ cap_saved = (struct pci_cap_saved *)((u8 *)cap_saved + len);
+ }
+ /* Empty cap_save terminates list */
+
+ return state;
+}
+EXPORT_SYMBOL_GPL(pci_store_saved_state);
+
+/**
+ * pci_load_saved_state - Reload the provided save state into struct pci_dev.
+ * @dev: PCI device that we're dealing with
+ * @state: Saved state returned from pci_store_saved_state()
+ */
+int pci_load_saved_state(struct pci_dev *dev, struct pci_saved_state *state)
+{
+ struct pci_cap_saved *cap_saved;
+
+ dev->state_saved = false;
+
+ if (!state)
+ return 0;
+
+ memcpy(dev->saved_config_space, state->config_space,
+ sizeof(state->config_space));
+
+ cap_saved = state->cap_saved;
+ while (cap_saved->size) {
+ struct pci_cap_saved_state *tmp;
+
+ tmp = pci_find_saved_cap(dev, cap_saved->cap_nr);
+ if (!tmp || tmp->saved.size != cap_saved->size)
+ return -EINVAL;
+
+ memcpy(tmp->saved.data, cap_saved->data, tmp->saved.size);
+ cap_saved = (struct pci_cap_saved *)((u8 *)cap_saved +
+ sizeof(struct pci_cap_saved) + cap_saved->size);
+ }
+
+ dev->state_saved = true;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pci_load_saved_state);
+
+/**
+ * pci_load_and_free_saved_state - Reload the save state pointed to by state,
+ * and free the memory allocated for it.
+ * @dev: PCI device that we're dealing with
+ * @state: Pointer to saved state returned from pci_store_saved_state()
+ */
+int pci_load_and_free_saved_state(struct pci_dev *dev,
+ struct pci_saved_state **state)
+{
+ int ret = pci_load_saved_state(dev, *state);
+ kfree(*state);
+ *state = NULL;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pci_load_and_free_saved_state);
+
static int do_pci_enable_device(struct pci_dev *dev, int bars)
{
int err;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 46fd382..f2a6262 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -812,6 +812,10 @@ size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, size_t size);
/* Power management related routines */
int pci_save_state(struct pci_dev *dev);
void pci_restore_state(struct pci_dev *dev);
+struct pci_saved_state *pci_store_saved_state(struct pci_dev *dev);
+int pci_load_saved_state(struct pci_dev *dev, struct pci_saved_state *state);
+int pci_load_and_free_saved_state(struct pci_dev *dev,
+ struct pci_saved_state **state);
int __pci_complete_power_transition(struct pci_dev *dev, pci_power_t state);
int pci_set_power_state(struct pci_dev *dev, pci_power_t state);
pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state);
Store the device saved state so that we can reload the device back
to the original state when it's unassigned. This has the benefit
that the state survives across pci_reset_function() calls via
the PCI sysfs reset interface while the VM is using the device.
Signed-off-by: Alex Williamson <[email protected]>
---
include/linux/kvm_host.h | 1 +
virt/kvm/assigned-dev.c | 18 ++++++++++++++----
2 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ab42855..9272db0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -513,6 +513,7 @@ struct kvm_assigned_dev_kernel {
struct kvm *kvm;
spinlock_t intx_lock;
char irq_name[32];
+ struct pci_saved_state *pci_saved_state;
};
struct kvm_irq_mask_notifier {
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ae72ae6..6cc4b97 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -197,8 +197,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
{
kvm_free_assigned_irq(kvm, assigned_dev);
- __pci_reset_function(assigned_dev->dev);
- pci_restore_state(assigned_dev->dev);
+ pci_reset_function(assigned_dev->dev);
+ if (pci_load_and_free_saved_state(assigned_dev->dev,
+ &assigned_dev->pci_saved_state))
+ printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
+ __func__, dev_name(&assigned_dev->dev->dev));
+ else
+ pci_restore_state(assigned_dev->dev);
pci_release_regions(assigned_dev->dev);
pci_disable_device(assigned_dev->dev);
@@ -516,7 +521,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
pci_reset_function(dev);
pci_save_state(dev);
-
+ match->pci_saved_state = pci_store_saved_state(dev);
+ if (!match->pci_saved_state)
+ printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
+ __func__, dev_name(&dev->dev));
match->assigned_dev_id = assigned_dev->assigned_dev_id;
match->host_segnr = assigned_dev->segnr;
match->host_busnr = assigned_dev->busnr;
@@ -546,7 +554,9 @@ out:
mutex_unlock(&kvm->lock);
return r;
out_list_del:
- pci_restore_state(dev);
+ if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
+ printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
+ __func__, dev_name(&dev->dev));
list_del(&match->list);
pci_release_regions(dev);
out_disable:
On 04/20/2011 11:31 PM, Alex Williamson wrote:
> v2 -> v3:
> Saved structure has variable contents.
>
> Avi, see if this adds any credibility to the pci-core allocated
> opaque buffer. It was wrong in the previous versions to distill
> the variable device capability save list into a fixed struct.
> This should also eliminate any future maintenance specific to
> this storing and loading of state as capability save changes.
>
Haha, nice trick. Yes, it does.
--
error compiling committee.c: too many arguments to function
On 04/20/2011 11:31 PM, Alex Williamson wrote:
> Store the device saved state so that we can reload the device back
> to the original state when it's unassigned. This has the benefit
> that the state survives across pci_reset_function() calls via
> the PCI sysfs reset interface while the VM is using the device.
Acked-by: Avi Kivity <[email protected]>
--
error compiling committee.c: too many arguments to function
On Wed, 20 Apr 2011 14:31:33 -0600
Alex Williamson <[email protected]> wrote:
> -struct pci_cap_saved_state {
> - struct hlist_node next;
> +struct pci_cap_saved {
> char cap_nr;
> + unsigned int size;
> u32 data[0];
> };
>
> +struct pci_cap_saved_state {
> + struct hlist_node next;
> + struct pci_cap_saved saved;
> +};
> +
> struct pcie_link_state;
> struct pci_vpd;
> struct pci_sriov;
> @@ -366,7 +371,7 @@ static inline struct pci_cap_saved_state *pci_find_saved_cap(
> struct hlist_node *pos;
>
> hlist_for_each_entry(tmp, pos, &pci_dev->saved_cap_space, next) {
> - if (tmp->cap_nr == cap)
> + if (tmp->saved.cap_nr == cap)
> return tmp;
> }
> return NULL;
Looks pretty good in general. But I think the naming makes it harder
to read than it ought to be.
So we have a pci_cap_saved_state, which implies capability info, and
that's fine.
But pci_cap_saved doesn't communicate much; maybe pci_cap_data or
pci_cap_saved_data would be better?
Thanks,
--
Jesse Barnes, Intel Open Source Technology Center