This patchset proposes a solution for adding live migration
support for SR-IOV NICs.
During migration, Qemu needs to let the VF driver in the VM know when
migration starts and ends. Qemu adds a fake PCI migration capability
to help synchronize status between the two sides during migration.
Qemu triggers the VF's mailbox irq by sending an MSI-X message when the
migration status changes. The VF driver tells Qemu its mailbox vector
index via the new PCI capability. In some cases (the NIC is suspended
or closed), the VF mailbox irq is freed and the VF driver can disable
irq injection via the new capability.
The VF driver will bring the NIC down before migration and bring it up
again on the target machine.
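For clarity, a rough sketch of the intended handshake (my summary of
the patches below, not new code):

    VF driver                              Qemu
    ---------                              ----
    write mailbox vector index  ------->   PCI_VF_MIGRATION_IRQ
                                <-------   set VMM_MIGRATION_START,
                                           inject MSI-X on mailbox vector
    bring NIC down, save state
    write PCI_VF_READY_FOR_MIGRATION -->
             ... stop-and-copy, switch to target machine ...
                                <-------   set VMM_MIGRATION_END,
                                           inject MSI-X on mailbox vector
    restore state, bring NIC up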
Lan Tianyu (3):
VFIO: Add new ioctl cmd VFIO_GET_PCI_CAP_INFO
PCI: Add macros for faked PCI migration capability
Ixgbevf: Add migration support for ixgbevf driver
drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 5 ++
drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 102 ++++++++++++++++++++++
drivers/vfio/pci/vfio_pci.c | 21 +++++
drivers/vfio/pci/vfio_pci_config.c | 38 ++++++--
drivers/vfio/pci/vfio_pci_private.h | 5 ++
include/uapi/linux/pci_regs.h | 18 +++-
include/uapi/linux/vfio.h | 12 +++
7 files changed, 194 insertions(+), 7 deletions(-)
--
1.8.4.rc0.1.g8f6a3e5.dirty
This patch adds a new ioctl cmd, VFIO_GET_PCI_CAP_INFO, to get the
size of a PCI capability and to find free PCI config space regs for
a given position and size.
Qemu will add a fake PCI capability for migration and needs such
info.
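For reference, here is a rough sketch of the intended Qemu-side usage
(illustration only, not part of this patch; it assumes the uapi
additions below and a device fd obtained via the usual
VFIO_GROUP_GET_DEVICE_FD path):

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Ask the kernel for a free config-space region of "size" bytes,
 * searching from offset "start"; info.offset == 0 means none found. */
static int find_free_cfg_space(int device_fd, __u32 start, __u32 size)
{
	struct vfio_pci_cap_info info = {
		.argsz  = sizeof(info),
		.index  = VFIO_PCI_CAP_GET_FREE_REGION,
		.offset = start,
		.size   = size,
	};

	if (ioctl(device_fd, VFIO_GET_PCI_CAP_INFO, &info) < 0)
		return -1;

	return info.offset;
}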
Signed-off-by: Lan Tianyu <[email protected]>
---
drivers/vfio/pci/vfio_pci.c | 21 ++++++++++++++++++++
drivers/vfio/pci/vfio_pci_config.c | 38 +++++++++++++++++++++++++++++++------
drivers/vfio/pci/vfio_pci_private.h | 5 +++++
include/uapi/linux/vfio.h | 12 ++++++++++++
4 files changed, 70 insertions(+), 6 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 69fab0f..2e42de0 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -784,6 +784,27 @@ hot_reset_release:
kfree(groups);
return ret;
+ } else if (cmd == VFIO_GET_PCI_CAP_INFO) {
+ struct vfio_pci_cap_info info;
+ int offset;
+
+ if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+ return -EFAULT;
+
+ switch (info.index) {
+ case VFIO_PCI_CAP_GET_SIZE:
+ info.size = vfio_get_cap_size(vdev, info.cap, info.offset);
+ break;
+ case VFIO_PCI_CAP_GET_FREE_REGION:
+ offset = vfio_find_free_pci_config_reg(vdev,
+ info.offset, info.size);
+ info.offset = offset;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return copy_to_user((void __user *)arg, &info, sizeof(info));
}
return -ENOTTY;
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index ff75ca3..8afbda4 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -841,6 +841,21 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
return pos;
}
+int vfio_find_free_pci_config_reg(struct vfio_pci_device *vdev,
+ int pos, int size)
+{
+ int i, offset = pos;
+
+ for (i = pos; i < PCI_CFG_SPACE_SIZE; i++) {
+ if (vdev->pci_config_map[i] != PCI_CAP_ID_INVALID)
+ offset = i + 1;
+ else if (i - offset + 1 == size)
+ return offset;
+ }
+
+ return 0;
+}
+
static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos,
int count, struct perm_bits *perm,
int offset, __le32 *val)
@@ -1199,6 +1214,20 @@ static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev,
return ret;
}
+int vfio_get_cap_size(struct vfio_pci_device *vdev, u8 cap, int pos)
+{
+ int len;
+
+ len = pci_cap_length[cap];
+ if (len == 0xFF) { /* Variable length */
+ len = vfio_cap_len(vdev, cap, pos);
+ if (len < 0)
+ return len;
+ }
+
+ return len;
+}
+
static int vfio_cap_init(struct vfio_pci_device *vdev)
{
struct pci_dev *pdev = vdev->pdev;
@@ -1238,12 +1267,9 @@ static int vfio_cap_init(struct vfio_pci_device *vdev)
return ret;
if (cap <= PCI_CAP_ID_MAX) {
- len = pci_cap_length[cap];
- if (len == 0xFF) { /* Variable length */
- len = vfio_cap_len(vdev, cap, pos);
- if (len < 0)
- return len;
- }
+ len = vfio_get_cap_size(vdev, cap, pos);
+ if (len < 0)
+ return len;
}
if (!len) {
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index ae0e1b4..91b4f9b 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -89,4 +89,9 @@ extern void vfio_pci_uninit_perm_bits(void);
extern int vfio_config_init(struct vfio_pci_device *vdev);
extern void vfio_config_free(struct vfio_pci_device *vdev);
+extern int vfio_find_free_pci_config_reg(struct vfio_pci_device *vdev,
+ int pos, int size);
+extern int vfio_get_cap_size(struct vfio_pci_device *vdev,
+ u8 cap, int pos);
+
#endif /* VFIO_PCI_PRIVATE_H */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index b57b750..dfa7023 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -495,6 +495,18 @@ struct vfio_eeh_pe_op {
#define VFIO_EEH_PE_OP _IO(VFIO_TYPE, VFIO_BASE + 21)
+#define VFIO_GET_PCI_CAP_INFO _IO(VFIO_TYPE, VFIO_BASE + 22)
+struct vfio_pci_cap_info {
+ __u32 argsz;
+ __u32 flags;
+#define VFIO_PCI_CAP_GET_SIZE (1 << 0)
+#define VFIO_PCI_CAP_GET_FREE_REGION (1 << 1)
+ __u32 index;
+ __u32 offset;
+ __u32 size;
+ __u8 cap;
+};
+
/* ***************************************************************** */
#endif /* _UAPIVFIO_H */
--
1.8.4.rc0.1.g8f6a3e5.dirty
This patch extends the PCI capability IDs with a migration capability
and adds reg macros. The capability ID is tentative; we may find a
better one if the solution proves feasible.
*PCI_VF_MIGRATION_CAP
Lets the VF driver control whether the mailbox irq is triggered during migration.
*PCI_VF_MIGRATION_VMM_STATUS
Qemu stores the migration status in this reg.
*PCI_VF_MIGRATION_VF_STATUS
The VF driver tells Qemu it is ready for migration via this reg.
*PCI_VF_MIGRATION_IRQ
The VF driver stores the mailbox interrupt vector index in this reg for Qemu to trigger during migration.
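For reference, the fake capability Qemu emulates would be laid out
roughly as follows (my sketch; the first two bytes are the standard
capability header, the rest are the new regs defined below, offsets
relative to the capability base):

    0x00  capability ID (PCI_CAP_ID_MIGRATION, 0x14)
    0x01  next capability pointer
    0x04  PCI_VF_MIGRATION_CAP         (PCI_VF_MIGRATION_ENABLE/DISABLE)
    0x05  PCI_VF_MIGRATION_VMM_STATUS  (VMM_MIGRATION_START/END)
    0x06  PCI_VF_MIGRATION_VF_STATUS   (PCI_VF_WAIT/READY_FOR_MIGRATION)
    0x07  PCI_VF_MIGRATION_IRQ         (mailbox MSI-X vector index)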
Signed-off-by: Lan Tianyu <[email protected]>
---
include/uapi/linux/pci_regs.h | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index efe3443..9defb6f 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -216,7 +216,8 @@
#define PCI_CAP_ID_MSIX 0x11 /* MSI-X */
#define PCI_CAP_ID_SATA 0x12 /* SATA Data/Index Conf. */
#define PCI_CAP_ID_AF 0x13 /* PCI Advanced Features */
-#define PCI_CAP_ID_MAX PCI_CAP_ID_AF
+#define PCI_CAP_ID_MIGRATION 0X14
+#define PCI_CAP_ID_MAX PCI_CAP_ID_MIGRATION
#define PCI_CAP_LIST_NEXT 1 /* Next capability in the list */
#define PCI_CAP_FLAGS 2 /* Capability defined flags (16 bits) */
#define PCI_CAP_SIZEOF 4
@@ -904,4 +905,19 @@
#define PCI_TPH_CAP_ST_SHIFT 16 /* st table shift */
#define PCI_TPH_BASE_SIZEOF 12 /* size with no st table */
+/* Migration*/
+#define PCI_VF_MIGRATION_CAP 0x04
+#define PCI_VF_MIGRATION_VMM_STATUS 0x05
+#define PCI_VF_MIGRATION_VF_STATUS 0x06
+#define PCI_VF_MIGRATION_IRQ 0x07
+
+#define PCI_VF_MIGRATION_DISABLE 0x00
+#define PCI_VF_MIGRATION_ENABLE 0x01
+
+#define VMM_MIGRATION_END 0x00
+#define VMM_MIGRATION_START 0x01
+
+#define PCI_VF_WAIT_FOR_MIGRATION 0x00
+#define PCI_VF_READY_FOR_MIGRATION 0x01
+
#endif /* LINUX_PCI_REGS_H */
--
1.8.4.rc0.1.g8f6a3e5.dirty
This patch adds migration support to the ixgbevf driver. The fake
PCI migration capability table is used to communicate with Qemu to
share the migration status and mailbox irq vector index.
Qemu will notify the VF by sending an MSI-X msg to trigger the mailbox
vector during migration, and it stores the migration status in the
PCI_VF_MIGRATION_VMM_STATUS reg in the new capability table.
The mailbox irq will be triggered just before the stop-and-copy stage
and again after migration on the target machine.
The VF driver will bring the net device down when it detects migration
and tell Qemu it's ready for migration by writing the
PCI_VF_MIGRATION_VF_STATUS reg. After migration, it brings the net
device up again.
Qemu is in charge of migrating the PCI config space regs and MSI-X config.
The patch targets the normal case where net traffic works and the
mailbox irq is enabled. For other cases (such as the driver not being
loaded, or the adapter being suspended or closed), the mailbox irq won't
be triggered and the VF driver will disable it via the PCI_VF_MIGRATION_CAP
reg. These cases will be resolved later.
Signed-off-by: Lan Tianyu <[email protected]>
---
drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 5 ++
drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 102 ++++++++++++++++++++++
2 files changed, 107 insertions(+)
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index 775d089..4b8ba2f 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -438,6 +438,11 @@ struct ixgbevf_adapter {
u64 bp_tx_missed;
#endif
+ u8 migration_cap;
+ u8 last_migration_reg;
+ unsigned long migration_status;
+ struct work_struct migration_task;
+
u8 __iomem *io_addr; /* Mainly for iounmap use */
u32 link_speed;
bool link_up;
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index a16d267..95860c2 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -96,6 +96,8 @@ static int debug = -1;
module_param(debug, int, 0);
MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
+#define MIGRATION_IN_PROGRESS 0
+
static void ixgbevf_service_event_schedule(struct ixgbevf_adapter *adapter)
{
if (!test_bit(__IXGBEVF_DOWN, &adapter->state) &&
@@ -1262,6 +1264,22 @@ static void ixgbevf_set_itr(struct ixgbevf_q_vector *q_vector)
}
}
+static void ixgbevf_migration_check(struct ixgbevf_adapter *adapter)
+{
+ struct pci_dev *pdev = adapter->pdev;
+ u8 val;
+
+ pci_read_config_byte(pdev,
+ adapter->migration_cap + PCI_VF_MIGRATION_VMM_STATUS,
+ &val);
+
+ if (val != adapter->last_migration_reg) {
+ schedule_work(&adapter->migration_task);
+ adapter->last_migration_reg = val;
+ }
+
+}
+
static irqreturn_t ixgbevf_msix_other(int irq, void *data)
{
struct ixgbevf_adapter *adapter = data;
@@ -1269,6 +1287,7 @@ static irqreturn_t ixgbevf_msix_other(int irq, void *data)
hw->mac.get_link_status = 1;
+ ixgbevf_migration_check(adapter);
ixgbevf_service_event_schedule(adapter);
IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, adapter->eims_other);
@@ -1383,6 +1402,7 @@ out:
static int ixgbevf_request_msix_irqs(struct ixgbevf_adapter *adapter)
{
struct net_device *netdev = adapter->netdev;
+ struct pci_dev *pdev = adapter->pdev;
int q_vectors = adapter->num_msix_vectors - NON_Q_VECTORS;
int vector, err;
int ri = 0, ti = 0;
@@ -1423,6 +1443,12 @@ static int ixgbevf_request_msix_irqs(struct ixgbevf_adapter *adapter)
goto free_queue_irqs;
}
+ if (adapter->migration_cap) {
+ pci_write_config_byte(pdev,
+ adapter->migration_cap + PCI_VF_MIGRATION_IRQ,
+ vector);
+ }
+
return 0;
free_queue_irqs:
@@ -2891,6 +2917,59 @@ static void ixgbevf_watchdog_subtask(struct ixgbevf_adapter *adapter)
ixgbevf_update_stats(adapter);
}
+static void ixgbevf_migration_task(struct work_struct *work)
+{
+ struct ixgbevf_adapter *adapter = container_of(work,
+ struct ixgbevf_adapter,
+ migration_task);
+ struct pci_dev *pdev = adapter->pdev;
+ struct net_device *netdev = adapter->netdev;
+ u8 val;
+
+ if (!test_bit(MIGRATION_IN_PROGRESS, &adapter->migration_status)) {
+ pci_read_config_byte(pdev,
+ adapter->migration_cap + PCI_VF_MIGRATION_VMM_STATUS,
+ &val);
+ if (val != VMM_MIGRATION_START)
+ return;
+
+ pr_info("migration start\n");
+ set_bit(MIGRATION_IN_PROGRESS, &adapter->migration_status);
+ netif_device_detach(netdev);
+
+ if (netif_running(netdev)) {
+ rtnl_lock();
+ ixgbevf_down(adapter);
+ rtnl_unlock();
+ }
+ pci_save_state(pdev);
+
+ /* Tell Qemu VF is ready for migration. */
+ pci_write_config_byte(pdev,
+ adapter->migration_cap + PCI_VF_MIGRATION_VF_STATUS,
+ PCI_VF_READY_FOR_MIGRATION);
+ } else {
+ pci_read_config_byte(pdev,
+ adapter->migration_cap + PCI_VF_MIGRATION_VMM_STATUS,
+ &val);
+ if (val != VMM_MIGRATION_END)
+ return;
+
+ pci_restore_state(pdev);
+
+ if (netif_running(netdev)) {
+ ixgbevf_reset(adapter);
+ ixgbevf_up(adapter);
+ }
+
+ netif_device_attach(netdev);
+
+ clear_bit(MIGRATION_IN_PROGRESS, &adapter->migration_status);
+ pr_info("migration end\n");
+ }
+
+}
+
/**
* ixgbevf_service_task - manages and runs subtasks
* @work: pointer to work_struct containing our data
@@ -3122,6 +3201,7 @@ static int ixgbevf_open(struct net_device *netdev)
{
struct ixgbevf_adapter *adapter = netdev_priv(netdev);
struct ixgbe_hw *hw = &adapter->hw;
+ struct pci_dev *pdev = adapter->pdev;
int err;
/* A previous failure to open the device because of a lack of
@@ -3175,6 +3255,13 @@ static int ixgbevf_open(struct net_device *netdev)
ixgbevf_up_complete(adapter);
+ if (adapter->migration_cap) {
+ pci_write_config_byte(pdev,
+ adapter->migration_cap + PCI_VF_MIGRATION_CAP,
+ PCI_VF_MIGRATION_ENABLE);
+ adapter->last_migration_reg = 0;
+ }
+
return 0;
err_req_irq:
@@ -3204,6 +3291,13 @@ err_setup_reset:
static int ixgbevf_close(struct net_device *netdev)
{
struct ixgbevf_adapter *adapter = netdev_priv(netdev);
+ struct pci_dev *pdev = adapter->pdev;
+
+ if (adapter->migration_cap) {
+ pci_write_config_byte(pdev,
+ adapter->migration_cap + PCI_VF_MIGRATION_CAP,
+ PCI_VF_MIGRATION_DISABLE);
+ }
ixgbevf_down(adapter);
ixgbevf_free_irq(adapter);
@@ -3764,6 +3858,12 @@ static int ixgbevf_suspend(struct pci_dev *pdev, pm_message_t state)
int retval = 0;
#endif
+ if (adapter->migration_cap) {
+ pci_write_config_byte(pdev,
+ adapter->migration_cap + PCI_VF_MIGRATION_CAP,
+ PCI_VF_MIGRATION_DISABLE);
+ }
+
netif_device_detach(netdev);
if (netif_running(netdev)) {
@@ -4029,6 +4129,7 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
(unsigned long)adapter);
INIT_WORK(&adapter->service_task, ixgbevf_service_task);
+ INIT_WORK(&adapter->migration_task, ixgbevf_migration_task);
set_bit(__IXGBEVF_SERVICE_INITED, &adapter->state);
clear_bit(__IXGBEVF_SERVICE_SCHED, &adapter->state);
@@ -4064,6 +4165,7 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
break;
}
+ adapter->migration_cap = pci_find_capability(pdev, PCI_CAP_ID_MIGRATION);
return 0;
err_register:
--
1.8.4.rc0.1.g8f6a3e5.dirty
On 11/24/2015 05:38 AM, Lan Tianyu wrote:
> This patchset is to propose a solution of adding live migration
> support for SRIOV NIC.
>
> During migration, Qemu needs to let VF driver in the VM to know
> migration start and end. Qemu adds faked PCI migration capability
> to help to sync status between two sides during migration.
>
> Qemu triggers VF's mailbox irq via sending MSIX msg when migration
> status is changed. VF driver tells Qemu its mailbox vector index
> via the new PCI capability. In some cases(NIC is suspended or closed),
> VF mailbox irq is freed and VF driver can disable irq injecting via
> new capability.
>
> VF driver will put down nic before migration and put up again on
> the target machine.
>
> Lan Tianyu (3):
> VFIO: Add new ioctl cmd VFIO_GET_PCI_CAP_INFO
> PCI: Add macros for faked PCI migration capability
> Ixgbevf: Add migration support for ixgbevf driver
>
> drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 5 ++
> drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 102 ++++++++++++++++++++++
> drivers/vfio/pci/vfio_pci.c | 21 +++++
> drivers/vfio/pci/vfio_pci_config.c | 38 ++++++--
> drivers/vfio/pci/vfio_pci_private.h | 5 ++
> include/uapi/linux/pci_regs.h | 18 +++-
> include/uapi/linux/vfio.h | 12 +++
> 7 files changed, 194 insertions(+), 7 deletions(-)
I'm still not a fan of this approach. I really feel like this is
something that should be resolved by extending the existing PCI hot-plug
rather than trying to instrument this per driver. Then you will get the
goodness for multiple drivers and multiple OSes instead of just one. An
added advantage to dealing with this in the PCI hot-plug environment
would be that you could then still do a hot-plug even if the guest
didn't load a driver for the VF since you would be working with the PCI
slot instead of the device itself.
- Alex
On Tue, Nov 24, 2015 at 09:38:18PM +0800, Lan Tianyu wrote:
> This patch is to add migration support for ixgbevf driver. Using
> faked PCI migration capability table communicates with Qemu to
> share migration status and mailbox irq vector index.
>
> Qemu will notify VF via sending MSIX msg to trigger mailbox
> vector during migration and store migration status in the
> PCI_VF_MIGRATION_VMM_STATUS regs in the new capability table.
> The mailbox irq will be triggered just befoe stop-and-copy stage
> and after migration on the target machine.
>
> VF driver will put down net when detect migration and tell
> Qemu it's ready for migration via writing PCI_VF_MIGRATION_VF_STATUS
> reg. After migration, put up net again.
>
> Qemu will in charge of migrating PCI config space regs and MSIX config.
>
> The patch is to dedicate on the normal case that net traffic works
> when mailbox irq is enabled. For other cases(such as the driver
> isn't loaded, adapter is suspended or closed), mailbox irq won't be
> triggered and VF driver will disable it via PCI_VF_MIGRATION_CAP
> reg. These case will be resolved later.
>
> Signed-off-by: Lan Tianyu <[email protected]>
I have to say, I was much more interested in the idea
of tracking dirty memory. I have some thoughts about
that one - did you give up on it then?
> ---
> drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 5 ++
> drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 102 ++++++++++++++++++++++
> 2 files changed, 107 insertions(+)
>
> diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
> index 775d089..4b8ba2f 100644
> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
> @@ -438,6 +438,11 @@ struct ixgbevf_adapter {
> u64 bp_tx_missed;
> #endif
>
> + u8 migration_cap;
> + u8 last_migration_reg;
> + unsigned long migration_status;
> + struct work_struct migration_task;
> +
> u8 __iomem *io_addr; /* Mainly for iounmap use */
> u32 link_speed;
> bool link_up;
> diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> index a16d267..95860c2 100644
> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> @@ -96,6 +96,8 @@ static int debug = -1;
> module_param(debug, int, 0);
> MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
>
> +#define MIGRATION_IN_PROGRESS 0
> +
> static void ixgbevf_service_event_schedule(struct ixgbevf_adapter *adapter)
> {
> if (!test_bit(__IXGBEVF_DOWN, &adapter->state) &&
> @@ -1262,6 +1264,22 @@ static void ixgbevf_set_itr(struct ixgbevf_q_vector *q_vector)
> }
> }
>
> +static void ixgbevf_migration_check(struct ixgbevf_adapter *adapter)
> +{
> + struct pci_dev *pdev = adapter->pdev;
> + u8 val;
> +
> + pci_read_config_byte(pdev,
> + adapter->migration_cap + PCI_VF_MIGRATION_VMM_STATUS,
> + &val);
> +
> + if (val != adapter->last_migration_reg) {
> + schedule_work(&adapter->migration_task);
> + adapter->last_migration_reg = val;
> + }
> +
> +}
> +
> static irqreturn_t ixgbevf_msix_other(int irq, void *data)
> {
> struct ixgbevf_adapter *adapter = data;
> @@ -1269,6 +1287,7 @@ static irqreturn_t ixgbevf_msix_other(int irq, void *data)
>
> hw->mac.get_link_status = 1;
>
> + ixgbevf_migration_check(adapter);
> ixgbevf_service_event_schedule(adapter);
>
> IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, adapter->eims_other);
> @@ -1383,6 +1402,7 @@ out:
> static int ixgbevf_request_msix_irqs(struct ixgbevf_adapter *adapter)
> {
> struct net_device *netdev = adapter->netdev;
> + struct pci_dev *pdev = adapter->pdev;
> int q_vectors = adapter->num_msix_vectors - NON_Q_VECTORS;
> int vector, err;
> int ri = 0, ti = 0;
> @@ -1423,6 +1443,12 @@ static int ixgbevf_request_msix_irqs(struct ixgbevf_adapter *adapter)
> goto free_queue_irqs;
> }
>
> + if (adapter->migration_cap) {
> + pci_write_config_byte(pdev,
> + adapter->migration_cap + PCI_VF_MIGRATION_IRQ,
> + vector);
> + }
> +
> return 0;
>
> free_queue_irqs:
> @@ -2891,6 +2917,59 @@ static void ixgbevf_watchdog_subtask(struct ixgbevf_adapter *adapter)
> ixgbevf_update_stats(adapter);
> }
>
> +static void ixgbevf_migration_task(struct work_struct *work)
> +{
> + struct ixgbevf_adapter *adapter = container_of(work,
> + struct ixgbevf_adapter,
> + migration_task);
> + struct pci_dev *pdev = adapter->pdev;
> + struct net_device *netdev = adapter->netdev;
> + u8 val;
> +
> + if (!test_bit(MIGRATION_IN_PROGRESS, &adapter->migration_status)) {
> + pci_read_config_byte(pdev,
> + adapter->migration_cap + PCI_VF_MIGRATION_VMM_STATUS,
> + &val);
> + if (val != VMM_MIGRATION_START)
> + return;
> +
> + pr_info("migration start\n");
> + set_bit(MIGRATION_IN_PROGRESS, &adapter->migration_status);
> + netif_device_detach(netdev);
> +
> + if (netif_running(netdev)) {
> + rtnl_lock();
> + ixgbevf_down(adapter);
> + rtnl_unlock();
> + }
> + pci_save_state(pdev);
> +
> + /* Tell Qemu VF is ready for migration. */
> + pci_write_config_byte(pdev,
> + adapter->migration_cap + PCI_VF_MIGRATION_VF_STATUS,
> + PCI_VF_READY_FOR_MIGRATION);
> + } else {
> + pci_read_config_byte(pdev,
> + adapter->migration_cap + PCI_VF_MIGRATION_VMM_STATUS,
> + &val);
> + if (val != VMM_MIGRATION_END)
> + return;
> +
> + pci_restore_state(pdev);
> +
> + if (netif_running(netdev)) {
> + ixgbevf_reset(adapter);
> + ixgbevf_up(adapter);
> + }
> +
> + netif_device_attach(netdev);
> +
> + clear_bit(MIGRATION_IN_PROGRESS, &adapter->migration_status);
> + pr_info("migration end\n");
> + }
> +
> +}
> +
> /**
> * ixgbevf_service_task - manages and runs subtasks
> * @work: pointer to work_struct containing our data
> @@ -3122,6 +3201,7 @@ static int ixgbevf_open(struct net_device *netdev)
> {
> struct ixgbevf_adapter *adapter = netdev_priv(netdev);
> struct ixgbe_hw *hw = &adapter->hw;
> + struct pci_dev *pdev = adapter->pdev;
> int err;
>
> /* A previous failure to open the device because of a lack of
> @@ -3175,6 +3255,13 @@ static int ixgbevf_open(struct net_device *netdev)
>
> ixgbevf_up_complete(adapter);
>
> + if (adapter->migration_cap) {
> + pci_write_config_byte(pdev,
> + adapter->migration_cap + PCI_VF_MIGRATION_CAP,
> + PCI_VF_MIGRATION_ENABLE);
> + adapter->last_migration_reg = 0;
> + }
> +
> return 0;
>
> err_req_irq:
> @@ -3204,6 +3291,13 @@ err_setup_reset:
> static int ixgbevf_close(struct net_device *netdev)
> {
> struct ixgbevf_adapter *adapter = netdev_priv(netdev);
> + struct pci_dev *pdev = adapter->pdev;
> +
> + if (adapter->migration_cap) {
> + pci_write_config_byte(pdev,
> + adapter->migration_cap + PCI_VF_MIGRATION_CAP,
> + PCI_VF_MIGRATION_DISABLE);
> + }
>
> ixgbevf_down(adapter);
> ixgbevf_free_irq(adapter);
> @@ -3764,6 +3858,12 @@ static int ixgbevf_suspend(struct pci_dev *pdev, pm_message_t state)
> int retval = 0;
> #endif
>
> + if (adapter->migration_cap) {
> + pci_write_config_byte(pdev,
> + adapter->migration_cap + PCI_VF_MIGRATION_CAP,
> + PCI_VF_MIGRATION_DISABLE);
> + }
> +
> netif_device_detach(netdev);
>
> if (netif_running(netdev)) {
> @@ -4029,6 +4129,7 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> (unsigned long)adapter);
>
> INIT_WORK(&adapter->service_task, ixgbevf_service_task);
> + INIT_WORK(&adapter->migration_task, ixgbevf_migration_task);
> set_bit(__IXGBEVF_SERVICE_INITED, &adapter->state);
> clear_bit(__IXGBEVF_SERVICE_SCHED, &adapter->state);
>
> @@ -4064,6 +4165,7 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> break;
> }
>
> + adapter->migration_cap = pci_find_capability(pdev, PCI_CAP_ID_MIGRATION);
> return 0;
>
> err_register:
> --
> 1.8.4.rc0.1.g8f6a3e5.dirty
On 2015年11月24日 22:20, Alexander Duyck wrote:
> I'm still not a fan of this approach. I really feel like this is
> something that should be resolved by extending the existing PCI hot-plug
> rather than trying to instrument this per driver. Then you will get the
> goodness for multiple drivers and multiple OSes instead of just one. An
> added advantage to dealing with this in the PCI hot-plug environment
> would be that you could then still do a hot-plug even if the guest
> didn't load a driver for the VF since you would be working with the PCI
> slot instead of the device itself.
>
> - Alex
Hi Alex:
What you mentioned sounds like the bonding driver solution.
The paper "Live Migration with Pass-through Device for Linux VM"
describes it. It hot-plugs the VF during migration. In order to maintain
the network connection while the VF is removed, it takes advantage of
the Linux bonding driver to switch between the VF NIC and an emulated
NIC. But the side effect is that it requires the VM to do additional
configuration, and the performance while switching between the two NICs
is not good.
--
Best regards
Tianyu Lan
On Tue, Nov 24, 2015 at 7:18 PM, Lan Tianyu <[email protected]> wrote:
> On 2015年11月24日 22:20, Alexander Duyck wrote:
>> I'm still not a fan of this approach. I really feel like this is
>> something that should be resolved by extending the existing PCI hot-plug
>> rather than trying to instrument this per driver. Then you will get the
>> goodness for multiple drivers and multiple OSes instead of just one. An
>> added advantage to dealing with this in the PCI hot-plug environment
>> would be that you could then still do a hot-plug even if the guest
>> didn't load a driver for the VF since you would be working with the PCI
>> slot instead of the device itself.
>>
>> - Alex
>
> Hi Alex:
> What's you mentioned seems the bonding driver solution.
> Paper "Live Migration with Pass-through Device for Linux VM" describes
> it. It does VF hotplug during migration. In order to maintain Network
> connection when VF is out, it takes advantage of Linux bonding driver to
> switch between VF NIC and emulated NIC. But the side affects, that
> requires VM to do additional configure and the performance during
> switching two NIC is not good.
No, what I am getting at is that you can't go around and modify the
configuration space for every possible device out there. This
solution won't scale. If you instead moved the logic for notifying
the device into a separate mechanism such as making it a part of the
hot-plug logic then you only have to write the code once per OS in
order to get the hot-plug capability to pause/resume the device. What
I am talking about is not full hot-plug, but rather to extend the
existing hot-plug in Qemu and the Linux kernel to support a
"pause/resume" functionality. The PCI hot-plug specification calls
out the option of implementing something like this, but we don't
currently have support for it.
I just feel doing it through PCI hot-plug messages will scale much
better as you could likely make use of the power management
suspend/resume calls to take care of most of the needed implementation
details.
- Alex
On Tue, Nov 24, 2015 at 1:20 PM, Michael S. Tsirkin <[email protected]> wrote:
> On Tue, Nov 24, 2015 at 09:38:18PM +0800, Lan Tianyu wrote:
>> This patch is to add migration support for ixgbevf driver. Using
>> faked PCI migration capability table communicates with Qemu to
>> share migration status and mailbox irq vector index.
>>
>> Qemu will notify VF via sending MSIX msg to trigger mailbox
>> vector during migration and store migration status in the
>> PCI_VF_MIGRATION_VMM_STATUS regs in the new capability table.
>> The mailbox irq will be triggered just befoe stop-and-copy stage
>> and after migration on the target machine.
>>
>> VF driver will put down net when detect migration and tell
>> Qemu it's ready for migration via writing PCI_VF_MIGRATION_VF_STATUS
>> reg. After migration, put up net again.
>>
>> Qemu will in charge of migrating PCI config space regs and MSIX config.
>>
>> The patch is to dedicate on the normal case that net traffic works
>> when mailbox irq is enabled. For other cases(such as the driver
>> isn't loaded, adapter is suspended or closed), mailbox irq won't be
>> triggered and VF driver will disable it via PCI_VF_MIGRATION_CAP
>> reg. These case will be resolved later.
>>
>> Signed-off-by: Lan Tianyu <[email protected]>
>
> I have to say, I was much more interested in the idea
> of tracking dirty memory. I have some thoughts about
> that one - did you give up on it then?
The tracking of dirty pages still needs to be addressed unless the
interface is being downed before migration even starts which based on
other comments I am assuming is not the case.
I still feel that having a means of marking a page as being dirty when
it is unmapped would be the best way to go. That way you only have to
update the DMA API instead of messing with each and every driver
trying to add code to force the page to be dirtied.
- Alex
On 2015年11月25日 05:20, Michael S. Tsirkin wrote:
> I have to say, I was much more interested in the idea
> of tracking dirty memory. I have some thoughts about
> that one - did you give up on it then?
No, our final target is to keep the VF active during migration, and
tracking dirty memory is essential for that. But this doesn't seem
easy to do upstream in the short term. As a starting point, we stop
the VF before migration.
After thinking it through further, the approach of stopping the VF
still needs to track DMA-dirtied memory, to make sure the receive data
buffers filled before the VF is stopped get migrated. It's easier to do
that via a dummy write to the data buffer when a packet is received.
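A minimal sketch of what I mean by a dummy write (hypothetical helper,
not in the patches): after the device has finished DMAing a packet,
rewrite one byte per page with the value it already holds, so the pages
are dirtied for migration without changing the packet contents:

/* Hypothetical sketch only: dirty the pages backing an rx buffer by
 * rewriting data that is already there, so migration recopies them.
 * A real version would also cover a buffer that straddles a page
 * boundary. */
static inline void rx_buffer_dummy_write(void *buf, unsigned int len)
{
	volatile u8 *p = buf;
	unsigned int off;

	for (off = 0; off < len; off += PAGE_SIZE)
		p[off] = p[off];	/* read and write back the same byte */
}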
--
Best regards
Tianyu Lan
On 2015年11月25日 13:30, Alexander Duyck wrote:
> No, what I am getting at is that you can't go around and modify the
> configuration space for every possible device out there. This
> solution won't scale.
The PCI config space regs are emulated by Qemu, so we can find free
PCI config space regs for the fake PCI capability. Its position doesn't
have to be fixed.
> If you instead moved the logic for notifying
> the device into a separate mechanism such as making it a part of the
> hot-plug logic then you only have to write the code once per OS in
> order to get the hot-plug capability to pause/resume the device. What
> I am talking about is not full hot-plug, but rather to extend the
> existing hot-plug in Qemu and the Linux kernel to support a
> "pause/resume" functionality. The PCI hot-plug specification calls
> out the option of implementing something like this, but we don't
> currently have support for it.
>
Could you elaborate on the part of the PCI hot-plug specification you
mentioned? My concern is whether it requires changing the PCI spec or not.
> I just feel doing it through PCI hot-plug messages will scale much
> better as you could likely make use of the power management
> suspend/resume calls to take care of most of the needed implementation
> details.
>
> - Alex
--
Best regards
Tianyu Lan
On Wed, Nov 25, 2015 at 01:39:32PM +0800, Lan Tianyu wrote:
> On 2015年11月25日 05:20, Michael S. Tsirkin wrote:
> > I have to say, I was much more interested in the idea
> > of tracking dirty memory. I have some thoughts about
> > that one - did you give up on it then?
>
> No, our finial target is to keep VF active before doing
> migration and tracking dirty memory is essential. But this
> seems not easy to do that in short term for upstream. As
> starters, stop VF before migration.
Frankly, I don't really see what this short term hack buys us,
and if it goes in, we'll have to maintain it forever.
Also, assuming you just want to do ifdown/ifup for some reason, it's
easy enough to do using a guest agent, in a completely generic way.
> After deep thinking, the way of stopping VF still needs tracking
> DMA-accessed dirty memory to make sure the received data buffer
> before stopping VF migrated. It's easier to do that via dummy writing
> data buffer when receive packet.
>
>
> --
> Best regards
> Tianyu Lan
On Wed, Nov 25, 2015 at 12:21 AM, Lan Tianyu <[email protected]> wrote:
> On 2015年11月25日 13:30, Alexander Duyck wrote:
>> No, what I am getting at is that you can't go around and modify the
>> configuration space for every possible device out there. This
>> solution won't scale.
>
>
> PCI config space regs are emulation by Qemu and so We can find the free
> PCI config space regs for the faked PCI capability. Its position can be
> not permanent.
Yes, but do you really want to edit every driver on every OS that you
plan to support this on. What about things like direct assignment of
regular Ethernet ports? What you really need is a solution that will
work generically on any existing piece of hardware out there.
>> If you instead moved the logic for notifying
>> the device into a separate mechanism such as making it a part of the
>> hot-plug logic then you only have to write the code once per OS in
>> order to get the hot-plug capability to pause/resume the device. What
>> I am talking about is not full hot-plug, but rather to extend the
>> existing hot-plug in Qemu and the Linux kernel to support a
>> "pause/resume" functionality. The PCI hot-plug specification calls
>> out the option of implementing something like this, but we don't
>> currently have support for it.
>>
>
> Could you elaborate the part of PCI hot-plug specification you mentioned?
>
> My concern is whether it needs to change PCI spec or not.
In the PCI Hot-Plug Specification 1.1, in section 4.1.2 it states:
In addition to quiescing add-in card activity, an operating-system
vendor may optionally implement a less drastic “pause” capability, in
anticipation of the same or a similar add-in card being reinserted.
The idea I had was basically if we were to implement something like
that in Linux then we could pause/resume the device instead of
outright removing it. The pause functionality could make use of the
suspend/resume functionality most drivers already have for PCI power
management.
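Just to illustrate the direction (a rough sketch on my part; none of
these hooks exist today), a "pause"/"resume" event could simply funnel
into the legacy PCI PM entry points that drivers such as ixgbevf
already implement:

/* Hypothetical sketch: reuse the existing PCI PM callbacks from a
 * hot-plug style pause/resume event instead of a full remove/probe. */
static int pci_pause_device(struct pci_dev *pdev)
{
	struct pci_driver *drv = pdev->driver;

	return (drv && drv->suspend) ? drv->suspend(pdev, PMSG_SUSPEND) : 0;
}

static int pci_unpause_device(struct pci_dev *pdev)
{
	struct pci_driver *drv = pdev->driver;

	return (drv && drv->resume) ? drv->resume(pdev) : 0;
}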
- Alex
On 11/25/2015 8:28 PM, Michael S. Tsirkin wrote:
> Frankly, I don't really see what this short term hack buys us,
> and if it goes in, we'll have to maintain it forever.
>
The framework for notifying the VF about migration status won't
change, regardless of whether we stop the VF before doing migration.
We hope to reach agreement on this first. Tracking dirty memory still
needs more discussion and we will continue working on it. Stopping the
VF may help to work around the issue and make tracking easier.
> Also, assuming you just want to do ifdown/ifup for some reason, it's
> easy enough to do using a guest agent, in a completely generic way.
>
Just ifdown/ifup is not enough for migration. Some PCI settings need
to be restored before doing ifup on the target machine.
On Thu, Nov 26, 2015 at 12:02:33AM +0800, Lan, Tianyu wrote:
> On 11/25/2015 8:28 PM, Michael S. Tsirkin wrote:
> >Frankly, I don't really see what this short term hack buys us,
> >and if it goes in, we'll have to maintain it forever.
> >
>
> The framework of how to notify VF about migration status won't be
> changed regardless of stopping VF or not before doing migration.
> We hope to reach agreement on this first.
Well it's bi-directional, the framework won't work if it's
uni-directional.
Further, if you use this interface to stop the interface
at the moment, you won't be able to do anything else
with it, and will need a new one down the road.
> Tracking dirty memory still
> need to more discussions and we will continue working on it. Stop VF may
> help to work around the issue and make tracking easier.
>
>
> >Also, assuming you just want to do ifdown/ifup for some reason, it's
> >easy enough to do using a guest agent, in a completely generic way.
> >
>
> Just ifdown/ifup is not enough for migration. It needs to restore some PCI
> settings before doing ifup on the target machine
I'd focus on just restoring then.
--
MST
On Wed, Nov 25, 2015 at 8:02 AM, Lan, Tianyu <[email protected]> wrote:
> On 11/25/2015 8:28 PM, Michael S. Tsirkin wrote:
>>
>> Frankly, I don't really see what this short term hack buys us,
>> and if it goes in, we'll have to maintain it forever.
>>
>
> The framework of how to notify VF about migration status won't be
> changed regardless of stopping VF or not before doing migration.
> We hope to reach agreement on this first. Tracking dirty memory still
> need to more discussions and we will continue working on it. Stop VF may
> help to work around the issue and make tracking easier.
The problem is you still have to stop the device at some point for the
same reason why you have to halt the VM. You seem to think you can
get by without doing that but you can't. All you do is open the
system up to multiple races if you leave the device running. The goal
should be to avoid stopping the device until the last possible moment,
however it will still have to be stopped eventually. It isn't as if
you can migrate memory and leave the device doing DMA and expect to
get a clean state.
I agree with Michael. The focus needs to be on first addressing dirty
page tracking. Once you have that you could use a variation on the
bonding solution where you postpone the hot-plug event until near the
end of the migration just before you halt the guest instead of having
to do it before you start the migration. Then, after that, we could
look at optimizing things further by introducing a variation of
hot-plug that would pause the device, as I suggested, instead of
removing it. At
that point you should be able to have almost all of the key issues
addresses so that you could drop the bond interface entirely.
>> Also, assuming you just want to do ifdown/ifup for some reason, it's
>> easy enough to do using a guest agent, in a completely generic way.
>>
>
> Just ifdown/ifup is not enough for migration. It needs to restore some PCI
> settings before doing ifup on the target machine
That is why I have been suggesting making use of suspend/resume logic
that is already in place for PCI power management. In the case of a
suspend/resume we already have to deal with the fact that the device
will go through a D0->D3->D0 reset so we have to restore all of the
existing state. It would take a significant load off of Qemu since
the guest would be restoring its own state instead of making Qemu have
to do all of the device migration work.
On Wed, Nov 25, 2015 at 08:24:38AM -0800, Alexander Duyck wrote:
> >> Also, assuming you just want to do ifdown/ifup for some reason, it's
> >> easy enough to do using a guest agent, in a completely generic way.
> >>
> >
> > Just ifdown/ifup is not enough for migration. It needs to restore some PCI
> > settings before doing ifup on the target machine
>
> That is why I have been suggesting making use of suspend/resume logic
> that is already in place for PCI power management. In the case of a
> suspend/resume we already have to deal with the fact that the device
> will go through a D0->D3->D0 reset so we have to restore all of the
> existing state. It would take a significant load off of Qemu since
> the guest would be restoring its own state instead of making Qemu have
> to do all of the device migration work.
That can work, though again, the issue is you need guest
cooperation to migrate.
If you reset device on destination instead of restoring state,
then that issue goes away, but maybe the downtime
will be increased.
Will it really? I think it's worth it to start with the
simplest solution (reset on destination) and see
what the effect is, then add optimizations.
One thing that I've been thinking about for a while, is saving (some)
state speculatively. For example, notify guest a bit before migration
is done, so it can save device state. If guest responds quickly, you
have state that can be restored. If it doesn't, still migrate, and it
will have to reset on destination.
--
MST
On Wed, Nov 25, 2015 at 8:39 AM, Michael S. Tsirkin <[email protected]> wrote:
> On Wed, Nov 25, 2015 at 08:24:38AM -0800, Alexander Duyck wrote:
>> >> Also, assuming you just want to do ifdown/ifup for some reason, it's
>> >> easy enough to do using a guest agent, in a completely generic way.
>> >>
>> >
>> > Just ifdown/ifup is not enough for migration. It needs to restore some PCI
>> > settings before doing ifup on the target machine
>>
>> That is why I have been suggesting making use of suspend/resume logic
>> that is already in place for PCI power management. In the case of a
>> suspend/resume we already have to deal with the fact that the device
>> will go through a D0->D3->D0 reset so we have to restore all of the
>> existing state. It would take a significant load off of Qemu since
>> the guest would be restoring its own state instead of making Qemu have
>> to do all of the device migration work.
>
> That can work, though again, the issue is you need guest
> cooperation to migrate.
Right now the problem is you need to have guest cooperation anyway as
you need to have some way of tracking the dirty pages. If the IOMMU
on the host were to provide some sort of dirty page tracking then we
could exclude the guest from the equation, but until then we need the
guest to notify us of what pages it is letting the device dirty. I'm
still of the opinion that the best way to go there is to just modify
the DMA API that is used in the guest so that it supports some sort of
page flag modification or something along those lines so we can track
all of the pages that might be written to by the device.
> If you reset device on destination instead of restoring state,
> then that issue goes away, but maybe the downtime
> will be increased.
Yes, the downtime will be increased, but it shouldn't be by much.
Depending on the setup a VF with a single queue can have about 3MB of
data outstanding when you move the driver over. After that it is just
a matter of bringing the interface back up which should take only a
few hundred milliseconds assuming the PF is fairly responsive.
> Will it really? I think it's worth it to start with the
> simplest solution (reset on destination) and see
> what the effect is, then add optimizations.
Agreed. My thought would be to start with something like
dma_mark_clean() that could be used to take care of marking the pages
for migration when they are unmapped or synced.
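Something along those lines is what I have in mind. A rough sketch (my
naming, not an existing API) of a dma_mark_clean()-style hook that
instead dirties the pages with a write that preserves their contents:

/* Rough sketch only: dirty every page backing a DMA buffer with a
 * write that does not change the data, so the hypervisor's dirty page
 * logging will recopy those pages during migration.  Assumes the
 * buffer is long-aligned. */
static void dma_mark_dirty(void *addr, size_t size)
{
	unsigned long off;

	for (off = 0; off < size; off += PAGE_SIZE) {
		unsigned long *p = addr + off;
		unsigned long val = READ_ONCE(*p);

		/* atomically write back the value we just read */
		cmpxchg(p, val, val);
	}
}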
> One thing that I've been thinking about for a while, is saving (some)
> state speculatively. For example, notify guest a bit before migration
> is done, so it can save device state. If guest responds quickly, you
> have state that can be restored. If it doesn't, still migrate, and it
> will have to reset on destination.
I'm not sure how much more device state we really need to save. The
driver in the guest has to have enough state to recover in the event
of a device failure resulting in a slot reset. To top it off the
driver is able to reconfigure things probably as quick as we could if
we were restoring the state.
> On Wed, Nov 25, 2015 at 12:21 AM, Lan Tianyu <[email protected]> wrote:
> > On 2015年11月25日 13:30, Alexander Duyck wrote:
> >> No, what I am getting at is that you can't go around and modify the
> >> configuration space for every possible device out there. This
> >> solution won't scale.
> >
> >
> > PCI config space regs are emulation by Qemu and so We can find the
> > free PCI config space regs for the faked PCI capability. Its position
> > can be not permanent.
>
> Yes, but do you really want to edit every driver on every OS that you plan to
> support this on. What about things like direct assignment of regular Ethernet
> ports? What you really need is a solution that will work generically on any
> existing piece of hardware out there.
The fundamental assumption of this patch series is to modify the driver in the guest to self-emulate or track the device state, so that migration becomes possible.
I don't think we can modify the OS without modifying the drivers, even using the PCIe hotplug mechanism.
Meanwhile, modifying the Windows OS is a big challenge, given that only Microsoft can do so, while modifying a driver is relatively simple and manageable for device vendors, if the device vendor wants to support state-clone based migration.
Thx, Eddie
On Wed, Nov 25, 2015 at 7:15 PM, Dong, Eddie <[email protected]> wrote:
>> On Wed, Nov 25, 2015 at 12:21 AM, Lan Tianyu <[email protected]> wrote:
>> > On 2015年11月25日 13:30, Alexander Duyck wrote:
>> >> No, what I am getting at is that you can't go around and modify the
>> >> configuration space for every possible device out there. This
>> >> solution won't scale.
>> >
>> >
>> > PCI config space regs are emulation by Qemu and so We can find the
>> > free PCI config space regs for the faked PCI capability. Its position
>> > can be not permanent.
>>
>> Yes, but do you really want to edit every driver on every OS that you plan to
>> support this on. What about things like direct assignment of regular Ethernet
>> ports? What you really need is a solution that will work generically on any
>> existing piece of hardware out there.
>
> The fundamental assumption of this patch series is to modify the driver in guest to self-emulate or track the device state, so that the migration may be possible.
> I don't think we can modify OS, without modifying the drivers, even using the PCIe hotplug mechanism.
> In the meantime, modifying Windows OS is a big challenge given that only Microsoft can do. While, modifying driver is relatively simple and manageable to device vendors, if the device vendor want to support state-clone based migration.
The problem is the code you are presenting, even as a proof of concept
is seriously flawed. It does a poor job of exposing how any of this
can be duplicated for any other VF other than the one you are working
on.
I am not saying you cannot modify the drivers, however what you are
doing is far too invasive. Do you seriously plan on modifying all of
the PCI device drivers out there in order to allow any device that
might be direct assigned to a port to support migration? I certainly
hope not. That is why I have said that this solution will not scale.
What I am counter proposing seems like a very simple proposition. It
can be implemented in two steps.
1. Look at modifying dma_mark_clean(). It is a function called in
the sync and unmap paths of the lib/swiotlb.c. If you could somehow
modify it to take care of marking the pages you unmap for Rx as being
dirty it will get you a good way towards your goal as it will allow
you to continue to do DMA while you are migrating the VM.
2. Look at making use of the existing PCI suspend/resume calls that
are there to support PCI power management. They have everything
needed to allow you to pause and resume DMA for the device before and
after the migration while retaining the driver state. If you can
implement something that allows you to trigger these calls from the
PCI subsystem such as hot-plug then you would have a generic solution
that can be easily reproduced for multiple drivers beyond those
supported by ixgbevf.
Thanks.
- Alex
On 11/26/2015 11:56 AM, Alexander Duyck wrote:
> > I am not saying you cannot modify the drivers, however what you are
> doing is far too invasive. Do you seriously plan on modifying all of
> the PCI device drivers out there in order to allow any device that
> might be direct assigned to a port to support migration? I certainly
> hope not. That is why I have said that this solution will not scale.
Current drivers are not migration friendly. If a driver wants to
support migration, it has to be changed.
RFC PATCH V1 presented our ideas about how to deal with MMIO, ring and
DMA tracking during migration. These are common to most drivers; they
may be problematic in the previous version but can be corrected later.
Doing suspend and resume may make migration easier, but some devices
require low service downtime, especially network devices; I've heard
that some cloud companies promise less than 500ms of network service
downtime.
So I think the performance impact should also be taken into account
when we design the framework.
>
> What I am counter proposing seems like a very simple proposition. It
> can be implemented in two steps.
>
> 1. Look at modifying dma_mark_clean(). It is a function called in
> the sync and unmap paths of the lib/swiotlb.c. If you could somehow
> modify it to take care of marking the pages you unmap for Rx as being
> dirty it will get you a good way towards your goal as it will allow
> you to continue to do DMA while you are migrating the VM.
>
> 2. Look at making use of the existing PCI suspend/resume calls that
> are there to support PCI power management. They have everything
> needed to allow you to pause and resume DMA for the device before and
> after the migration while retaining the driver state. If you can
> implement something that allows you to trigger these calls from the
> PCI subsystem such as hot-plug then you would have a generic solution
> that can be easily reproduced for multiple drivers beyond those
> supported by ixgbevf.
I glanced at the PCI hotplug code. The hotplug events are triggered by
the PCI hotplug controller and these events are defined in the
controller spec, so it's hard to add more events. Otherwise, we would
also need to add some specific code to the PCI hotplug core, since it
only adds and removes PCI devices when it gets events. It's also a
challenge to modify the Windows hotplug code. So we may need to find
another way.
>
> Thanks.
>
> - Alex
On Sun, Nov 29, 2015 at 10:53 PM, Lan, Tianyu <[email protected]> wrote:
> On 11/26/2015 11:56 AM, Alexander Duyck wrote:
>>
>> > I am not saying you cannot modify the drivers, however what you are
>> doing is far too invasive. Do you seriously plan on modifying all of
>> the PCI device drivers out there in order to allow any device that
>> might be direct assigned to a port to support migration? I certainly
>> hope not. That is why I have said that this solution will not scale.
>
>
> Current drivers are not migration friendly. If the driver wants to
> support migration, it's necessary to be changed.
Modifying all of the drivers directly will not solve the issue though.
This is why I have suggested looking at possibly implementing
something like dma_mark_clean() which is used for ia64 architectures
to mark pages that were DMAed in as clean. In your case though you
would want to mark such pages as dirty so that the page migration will
notice them and move them over.
> RFC PATCH V1 presented our ideas about how to deal with MMIO, ring and
> DMA tracking during migration. These are common for most drivers and
> they maybe problematic in the previous version but can be corrected later.
They can only be corrected if the underlying assumptions are correct
and they aren't. Your solution would have never worked correctly.
The problem is you assume you can keep the device running when you are
migrating and you simply cannot. At some point you will always have
to stop the device in order to complete the migration, and you cannot
stop it before you have stopped your page tracking mechanism. So
unless the platform has an IOMMU that is somehow taking part in the
dirty page tracking you will not be able to stop the guest and then
the device, it will have to be the device and then the guest.
> Doing suspend and resume() may help to do migration easily but some
> devices requires low service down time. Especially network and I got
> that some cloud company promised less than 500ms network service downtime.
Honestly focusing on the downtime is getting the cart ahead of the
horse. First you need to be able to do this without corrupting system
memory and regardless of the state of the device. You haven't even
gotten to that state yet. Last I knew the device had to be up in
order for your migration to even work.
Many devices are very state driven. As such you cannot just freeze
them and restore them like you would regular device memory. That is
where something like suspend/resume comes in because it already takes
care of getting the device ready for halt, and then resume. Keep in
mind that those functions were meant to function on a device doing
something like a suspend to RAM or disk. This is not too far off from
what a migration is doing since you need to halt the guest before you
move it.
As such the first step is to make it so that we can do the current
bonding approach with one change. Specifically we want to leave the
device in the guest until the last portion of the migration instead of
having to remove it first. To that end I would suggest focusing on
solving the DMA problem via something like a dma_mark_clean() type
solution as that would be one issue resolved and we all would see an
immediate gain instead of just those users of the ixgbevf driver.
> So I think performance effect also should be taken into account when we
> design the framework.
What you are proposing I would call premature optimization. You need
to actually solve the problem before you can start optimizing things
and I don't see anything actually solved yet since your solution is
too unstable.
>>
>> What I am counter proposing seems like a very simple proposition. It
>> can be implemented in two steps.
>>
>> 1. Look at modifying dma_mark_clean(). It is a function called in
>> the sync and unmap paths of the lib/swiotlb.c. If you could somehow
>> modify it to take care of marking the pages you unmap for Rx as being
>> dirty it will get you a good way towards your goal as it will allow
>> you to continue to do DMA while you are migrating the VM.
>>
>> 2. Look at making use of the existing PCI suspend/resume calls that
>> are there to support PCI power management. They have everything
>> needed to allow you to pause and resume DMA for the device before and
>> after the migration while retaining the driver state. If you can
>> implement something that allows you to trigger these calls from the
>> PCI subsystem such as hot-plug then you would have a generic solution
>> that can be easily reproduced for multiple drivers beyond those
>> supported by ixgbevf.
>
>
> Glanced at PCI hotplug code. The hotplug events are triggered by PCI hotplug
> controller and these event are defined in the controller spec.
> It's hard to extend more events. Otherwise, we also need to add some
> specific codes in the PCI hotplug core since it's only add and remove
> PCI device when it gets events. It's also a challenge to modify Windows
> hotplug codes. So we may need to find another way.
For now we can use conventional hot-plug. Removing the device should
be fairly quick and I suspect it would only dirty a few megs of memory
so just using conventional hot-plug for now is probably workable. The
suspend/resume approach would be a follow-up in order to improve the
speed of migration since those functions are more lightweight than a
remove/probe.
- Alex
On 12/1/2015 12:07 AM, Alexander Duyck wrote:
> They can only be corrected if the underlying assumptions are correct
> and they aren't. Your solution would have never worked correctly.
> The problem is you assume you can keep the device running when you are
> migrating and you simply cannot. At some point you will always have
> to stop the device in order to complete the migration, and you cannot
> stop it before you have stopped your page tracking mechanism. So
> unless the platform has an IOMMU that is somehow taking part in the
> dirty page tracking you will not be able to stop the guest and then
> the device, it will have to be the device and then the guest.
>
>> >Doing suspend and resume() may help to do migration easily but some
>> >devices requires low service down time. Especially network and I got
>> >that some cloud company promised less than 500ms network service downtime.
> Honestly focusing on the downtime is getting the cart ahead of the
> horse. First you need to be able to do this without corrupting system
> memory and regardless of the state of the device. You haven't even
> gotten to that state yet. Last I knew the device had to be up in
> order for your migration to even work.
I think the issue is that the content of an rx packet delivered to the
stack may change during migration, because that piece of memory won't
be migrated to the new machine. This may confuse applications or the
stack. The current dummy-write solution can ensure the packet content
won't change after the dummy write is done, while the content may not
be the received data if migration happens before that point. We can
recheck the content via the checksum or CRC in the protocol after the
dummy write to ensure the content is what the VF received. I think the
stack already does such checks and the packet will be dropped if it
fails them.
Another way is to tell Qemu about all the memory the driver is using
and let Qemu migrate that memory after stopping the VCPU and the
device. This seems safe but the implementation may be complex.
On Tue, Dec 01, 2015 at 11:04:31PM +0800, Lan, Tianyu wrote:
>
>
> On 12/1/2015 12:07 AM, Alexander Duyck wrote:
> >They can only be corrected if the underlying assumptions are correct
> >and they aren't. Your solution would have never worked correctly.
> >The problem is you assume you can keep the device running when you are
> >migrating and you simply cannot. At some point you will always have
> >to stop the device in order to complete the migration, and you cannot
> >stop it before you have stopped your page tracking mechanism. So
> >unless the platform has an IOMMU that is somehow taking part in the
> >dirty page tracking you will not be able to stop the guest and then
> >the device, it will have to be the device and then the guest.
> >
> >>>Doing suspend and resume() may help to do migration easily but some
> >>>devices requires low service down time. Especially network and I got
> >>>that some cloud company promised less than 500ms network service downtime.
> >Honestly focusing on the downtime is getting the cart ahead of the
> >horse. First you need to be able to do this without corrupting system
> >memory and regardless of the state of the device. You haven't even
> >gotten to that state yet. Last I knew the device had to be up in
> >order for your migration to even work.
>
> I think the issue is that the content of rx package delivered to stack maybe
> changed during migration because the piece of memory won't be migrated to
> new machine. This may confuse applications or stack. Current dummy write
> solution can ensure the content of package won't change after doing dummy
> write while the content maybe not received data if migration happens before
> that point. We can recheck the content via checksum or crc in the protocol
> after dummy write to ensure the content is what VF received. I think stack
> has already done such checks and the package will be abandoned if failed to
> pass through the check.
Most people nowadays rely on hardware checksums so I don't think this can
fly.
> Another way is to tell all memory driver are using to Qemu and let Qemu to
> migrate these memory after stopping VCPU and the device. This seems safe but
> implementation maybe complex.
Not really 100% safe. See below.
I think hiding these details behind dma_* API does have
some appeal. In any case, it gives us a good
terminology as it covers what most drivers do.
There are several components to this:
- dma_map_* needs to prevent page from
being migrated while device is running.
For example, expose some kind of bitmap from guest
to host, set bit there while page is mapped.
What happens if we stop the guest and some
bits are still set? See dma_alloc_coherent below
for some ideas.
- dma_unmap_* needs to mark page as dirty
This can be done by writing into a page.
- dma_sync_* needs to mark page as dirty
This is trickier as we can not change the data.
One solution is using atomics.
For example:
int x = ACCESS_ONCE(*p);
cmpxchg(p, x, x);
Seems to do a write without changing page
contents.
- dma_alloc_coherent memory (e.g. device rings)
must be migrated after device stopped modifying it.
Just stopping the VCPU is not enough:
you must make sure device is not changing it.
Or maybe the device has some kind of ring flush operation,
if there was a reasonably portable way to do this
(e.g. a flush capability could maybe be added to SRIOV)
then hypervisor could do this.
With existing devices,
either do it after device reset, or disable
memory access in the IOMMU. Maybe both.
In case you need to resume on source, you
really need to follow the same path
as on destination, preferably detecting
device reset and restoring the device
state.
A similar approach could work for dma_map_ above.
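A minimal sketch of a dirty-marking helper along these lines, using only
existing primitives (the function name is made up):

#include <linux/atomic.h>
#include <linux/compiler.h>

/*
 * Mark the page backing 'addr' dirty for migration purposes with a
 * write that cannot change the data: cmpxchg(p, x, x) either stores
 * the value that is already there or stores nothing at all.
 */
static void dma_mark_dirty(void *addr)
{
	unsigned long *p = addr;
	unsigned long x = ACCESS_ONCE(*p);

	cmpxchg(p, x, x);
}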
--
MST
On Tue, Dec 1, 2015 at 7:28 AM, Michael S. Tsirkin <[email protected]> wrote:
> On Tue, Dec 01, 2015 at 11:04:31PM +0800, Lan, Tianyu wrote:
>>
>>
>> On 12/1/2015 12:07 AM, Alexander Duyck wrote:
>> >They can only be corrected if the underlying assumptions are correct
>> >and they aren't. Your solution would have never worked correctly.
>> >The problem is you assume you can keep the device running when you are
>> >migrating and you simply cannot. At some point you will always have
>> >to stop the device in order to complete the migration, and you cannot
>> >stop it before you have stopped your page tracking mechanism. So
>> >unless the platform has an IOMMU that is somehow taking part in the
>> >dirty page tracking you will not be able to stop the guest and then
>> >the device, it will have to be the device and then the guest.
>> >
>> >>>Doing suspend and resume() may help to do migration easily but some
>> >>>devices requires low service down time. Especially network and I got
>> >>>that some cloud company promised less than 500ms network service downtime.
>> >Honestly focusing on the downtime is getting the cart ahead of the
>> >horse. First you need to be able to do this without corrupting system
>> >memory and regardless of the state of the device. You haven't even
>> >gotten to that state yet. Last I knew the device had to be up in
>> >order for your migration to even work.
>>
>> I think the issue is that the content of rx package delivered to stack maybe
>> changed during migration because the piece of memory won't be migrated to
>> new machine. This may confuse applications or stack. Current dummy write
>> solution can ensure the content of package won't change after doing dummy
>> write while the content maybe not received data if migration happens before
>> that point. We can recheck the content via checksum or crc in the protocol
>> after dummy write to ensure the content is what VF received. I think stack
>> has already done such checks and the package will be abandoned if failed to
>> pass through the check.
>
>
> Most people nowdays rely on hardware checksums so I don't think this can
> fly.
Correct. The checksum/crc approach will not work since it is possible
for a checksum to even be mangled in the case of some features such as
LRO or GRO.
>> Another way is to tell all memory driver are using to Qemu and let Qemu to
>> migrate these memory after stopping VCPU and the device. This seems safe but
>> implementation maybe complex.
>
> Not really 100% safe. See below.
>
> I think hiding these details behind dma_* API does have
> some appeal. In any case, it gives us a good
> terminology as it covers what most drivers do.
That was kind of my thought. If we were to build our own
dma_mark_clean() type function that marks the DMA region dirty on
sync or unmap, then that is half the battle right there, as we would
at least be able to keep the regions consistent after they have left
the driver.
> There are several components to this:
> - dma_map_* needs to prevent page from
> being migrated while device is running.
> For example, expose some kind of bitmap from guest
> to host, set bit there while page is mapped.
> What happens if we stop the guest and some
> bits are still set? See dma_alloc_coherent below
> for some ideas.
Yeah, I could see something like this working. Maybe we could do
something like what was done for the NX bit and make use of the upper
order bits beyond the limits of the memory range to mark pages as
non-migratable?
I'm curious. What we have with a DMA mapped region is essentially
shared memory between the guest and the device. How would we resolve
something like this with IVSHMEM, or are we blocked there as well in
terms of migration?
> - dma_unmap_* needs to mark page as dirty
> This can be done by writing into a page.
>
> - dma_sync_* needs to mark page as dirty
> This is trickier as we can not change the data.
> One solution is using atomics.
> For example:
> int x = ACCESS_ONCE(*p);
> cmpxchg(p, x, x);
> Seems to do a write without changing page
> contents.
Like I said we can probably kill 2 birds with one stone by just
implementing our own dma_mark_clean() for x86 virtualized
environments.
I'd say we could take your solution one step further and just use 0
instead of bothering to read the value. After all it won't write the
area if the value at the offset is not 0. The only downside is that
this is a locked operation so we will take a pretty serious
performance penalty when this is active. As such my preference would
be to hide the code behind some static key that we could then switch
on in the event of a VM being migrated.
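A sketch of what hiding this behind a static key could look like (the names
are made up; the inc/dec calls would be driven by whatever tells the guest
that a migration has started or ended):

#include <linux/jump_label.h>
#include <linux/atomic.h>
#include <linux/compiler.h>

/* Patched to true only while a migration is in progress. */
static struct static_key dma_migration_key = STATIC_KEY_INIT_FALSE;

static inline void dma_mark_dirty(void *addr)
{
	unsigned long *p = addr;

	if (static_key_false(&dma_migration_key)) {
		unsigned long x = ACCESS_ONCE(*p);

		cmpxchg(p, x, x);	/* locked op only while migrating */
	}
}

/* Driven by whatever mechanism notifies the guest about migration. */
static void dma_migration_start(void)
{
	static_key_slow_inc(&dma_migration_key);
}

static void dma_migration_end(void)
{
	static_key_slow_dec(&dma_migration_key);
}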
> - dma_alloc_coherent memory (e.g. device rings)
> must be migrated after device stopped modifying it.
> Just stopping the VCPU is not enough:
> you must make sure device is not changing it.
>
> Or maybe the device has some kind of ring flush operation,
> if there was a reasonably portable way to do this
> (e.g. a flush capability could maybe be added to SRIOV)
> then hypervisor could do this.
This is where things start to get messy. I was suggesting the
suspend/resume to resolve this bit, but it might also be possible to
deal with this by clearing the bus master enable bit for the VF. If I
am not mistaken that should disable MSI-X interrupts and halt any DMA.
That should work as long as you have some mechanism that is tracking
the pages in use for DMA.
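A sketch of quiescing the VF this way from the guest driver, using the
existing pci_clear_master() helper (the wrapper name is made up):

#include <linux/pci.h>
#include <linux/bug.h>

/*
 * Stop the VF from mastering the bus (DMA and MSI-X writes) before its
 * buffers are marked dirty and migrated.  After this point the device
 * has to be reset/re-initialised before traffic can resume.
 */
static void vf_quiesce_dma(struct pci_dev *pdev)
{
	u16 cmd;

	pci_clear_master(pdev);		/* clears PCI_COMMAND_MASTER */

	/* Read back so we know the config write reached the device
	 * before treating its DMA buffers as stable. */
	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	WARN_ON(cmd & PCI_COMMAND_MASTER);
}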
> With existing devices,
> either do it after device reset, or disable
> memory access in the IOMMU. Maybe both.
The problem is that disabling the device at the IOMMU will start to
trigger master abort errors when it tries to access regions it no
longer has access to.
> In case you need to resume on source, you
> really need to follow the same path
> as on destination, preferably detecting
> device reset and restoring the device
> state.
The problem with detecting the reset is that you would likely have to
be polling to do something like that. I believe the fm10k driver
already has code like that in place where it will detect a reset as a
part of its watchdog, however the response time is something like 2
seconds for that. That was one of the reasons I preferred something
like hot-plug as that should be functioning as soon as the guest is up
and it is a mechanism that operates outside of the VF drivers.
On Tue, Dec 01, 2015 at 09:04:32AM -0800, Alexander Duyck wrote:
> On Tue, Dec 1, 2015 at 7:28 AM, Michael S. Tsirkin <[email protected]> wrote:
> > On Tue, Dec 01, 2015 at 11:04:31PM +0800, Lan, Tianyu wrote:
> >>
> >>
> >> On 12/1/2015 12:07 AM, Alexander Duyck wrote:
> >> >They can only be corrected if the underlying assumptions are correct
> >> >and they aren't. Your solution would have never worked correctly.
> >> >The problem is you assume you can keep the device running when you are
> >> >migrating and you simply cannot. At some point you will always have
> >> >to stop the device in order to complete the migration, and you cannot
> >> >stop it before you have stopped your page tracking mechanism. So
> >> >unless the platform has an IOMMU that is somehow taking part in the
> >> >dirty page tracking you will not be able to stop the guest and then
> >> >the device, it will have to be the device and then the guest.
> >> >
> >> >>>Doing suspend and resume() may help to do migration easily but some
> >> >>>devices requires low service down time. Especially network and I got
> >> >>>that some cloud company promised less than 500ms network service downtime.
> >> >Honestly focusing on the downtime is getting the cart ahead of the
> >> >horse. First you need to be able to do this without corrupting system
> >> >memory and regardless of the state of the device. You haven't even
> >> >gotten to that state yet. Last I knew the device had to be up in
> >> >order for your migration to even work.
> >>
> >> I think the issue is that the content of rx package delivered to stack maybe
> >> changed during migration because the piece of memory won't be migrated to
> >> new machine. This may confuse applications or stack. Current dummy write
> >> solution can ensure the content of package won't change after doing dummy
> >> write while the content maybe not received data if migration happens before
> >> that point. We can recheck the content via checksum or crc in the protocol
> >> after dummy write to ensure the content is what VF received. I think stack
> >> has already done such checks and the package will be abandoned if failed to
> >> pass through the check.
> >
> >
> > Most people nowdays rely on hardware checksums so I don't think this can
> > fly.
>
> Correct. The checksum/crc approach will not work since it is possible
> for a checksum to even be mangled in the case of some features such as
> LRO or GRO.
>
> >> Another way is to tell all memory driver are using to Qemu and let Qemu to
> >> migrate these memory after stopping VCPU and the device. This seems safe but
> >> implementation maybe complex.
> >
> > Not really 100% safe. See below.
> >
> > I think hiding these details behind dma_* API does have
> > some appeal. In any case, it gives us a good
> > terminology as it covers what most drivers do.
>
> That was kind of my thought. If we were to build our own
> dma_mark_clean() type function that will mark the DMA region dirty on
> sync or unmap then that is half the battle right there as we would be
> able to at least keep the regions consistent after they have left the
> driver.
>
> > There are several components to this:
> > - dma_map_* needs to prevent page from
> > being migrated while device is running.
> > For example, expose some kind of bitmap from guest
> > to host, set bit there while page is mapped.
> > What happens if we stop the guest and some
> > bits are still set? See dma_alloc_coherent below
> > for some ideas.
>
> Yeah, I could see something like this working. Maybe we could do
> something like what was done for the NX bit and make use of the upper
> order bits beyond the limits of the memory range to mark pages as
> non-migratable?
>
> I'm curious. What we have with a DMA mapped region is essentially
> shared memory between the guest and the device. How would we resolve
> something like this with IVSHMEM, or are we blocked there as well in
> terms of migration?
I have some ideas. Will post later.
> > - dma_unmap_* needs to mark page as dirty
> > This can be done by writing into a page.
> >
> > - dma_sync_* needs to mark page as dirty
> > This is trickier as we can not change the data.
> > One solution is using atomics.
> > For example:
> > int x = ACCESS_ONCE(*p);
> > cmpxchg(p, x, x);
> > Seems to do a write without changing page
> > contents.
>
> Like I said we can probably kill 2 birds with one stone by just
> implementing our own dma_mark_clean() for x86 virtualized
> environments.
>
> I'd say we could take your solution one step further and just use 0
> instead of bothering to read the value. After all it won't write the
> area if the value at the offset is not 0.
Really almost any atomic that has no side effect will do.
atomic or with 0
atomic and with ffffffff
It's just that cmpxchg already happens to have a portable
wrapper.
> The only downside is that
> this is a locked operation so we will take a pretty serious
> performance penalty when this is active. As such my preference would
> be to hide the code behind some static key that we could then switch
> on in the event of a VM being migrated.
> > - dma_alloc_coherent memory (e.g. device rings)
> > must be migrated after device stopped modifying it.
> > Just stopping the VCPU is not enough:
> > you must make sure device is not changing it.
> >
> > Or maybe the device has some kind of ring flush operation,
> > if there was a reasonably portable way to do this
> > (e.g. a flush capability could maybe be added to SRIOV)
> > then hypervisor could do this.
>
> This is where things start to get messy. I was suggesting the
> suspend/resume to resolve this bit, but it might be possible to also
> deal with this via something like this via clearing the bus master
> enable bit for the VF. If I am not mistaken that should disable MSI-X
> interrupts and halt any DMA. That should work as long as you have
> some mechanism that is tracking the pages in use for DMA.
A bigger issue is recovering afterwards.
> > With existing devices,
> > either do it after device reset, or disable
> > memory access in the IOMMU. Maybe both.
>
> The problem is that disabling the device at the IOMMU will start to
> trigger master abort errors when it tries to access regions it no
> longer has access to.
>
> > In case you need to resume on source, you
> > really need to follow the same path
> > as on destination, preferably detecting
> > device reset and restoring the device
> > state.
>
> The problem with detecting the reset is that you would likely have to
> be polling to do something like that.
We could send some event to the guest to notify it about this
through a new or existing channel.
Or we could make it possible for userspace to trigger this,
then notify the guest through the guest agent.
> I believe the fm10k driver
> already has code like that in place where it will detect a reset as a
> part of its watchdog, however the response time is something like 2
> seconds for that. That was one of the reasons I preferred something
> like hot-plug as that should be functioning as soon as the guest is up
> and it is a mechanism that operates outside of the VF drivers.
That's pretty minor.
A bigger issue is making sure the guest does not crash
when the device is suddenly reset out from under it.
--
MST
On Tue, Dec 1, 2015 at 9:37 AM, Michael S. Tsirkin <[email protected]> wrote:
> On Tue, Dec 01, 2015 at 09:04:32AM -0800, Alexander Duyck wrote:
>> On Tue, Dec 1, 2015 at 7:28 AM, Michael S. Tsirkin <[email protected]> wrote:
>> > There are several components to this:
>> > - dma_map_* needs to prevent page from
>> > being migrated while device is running.
>> > For example, expose some kind of bitmap from guest
>> > to host, set bit there while page is mapped.
>> > What happens if we stop the guest and some
>> > bits are still set? See dma_alloc_coherent below
>> > for some ideas.
>>
>> Yeah, I could see something like this working. Maybe we could do
>> something like what was done for the NX bit and make use of the upper
>> order bits beyond the limits of the memory range to mark pages as
>> non-migratable?
>>
>> I'm curious. What we have with a DMA mapped region is essentially
>> shared memory between the guest and the device. How would we resolve
>> something like this with IVSHMEM, or are we blocked there as well in
>> terms of migration?
>
> I have some ideas. Will post later.
I look forward to it.
>> > - dma_unmap_* needs to mark page as dirty
>> > This can be done by writing into a page.
>> >
>> > - dma_sync_* needs to mark page as dirty
>> > This is trickier as we can not change the data.
>> > One solution is using atomics.
>> > For example:
>> > int x = ACCESS_ONCE(*p);
>> > cmpxchg(p, x, x);
>> > Seems to do a write without changing page
>> > contents.
>>
>> Like I said we can probably kill 2 birds with one stone by just
>> implementing our own dma_mark_clean() for x86 virtualized
>> environments.
>>
>> I'd say we could take your solution one step further and just use 0
>> instead of bothering to read the value. After all it won't write the
>> area if the value at the offset is not 0.
>
> Really almost any atomic that has no side effect will do.
> atomic or with 0
> atomic and with ffffffff
>
> It's just that cmpxchg already happens to have a portable
> wrapper.
I was originally thinking maybe an atomic_add with 0 would be the way
to go. Either way though we still are using a locked prefix and
having to dirty a cache line per page which is going to come at some
cost.
>> > - dma_alloc_coherent memory (e.g. device rings)
>> > must be migrated after device stopped modifying it.
>> > Just stopping the VCPU is not enough:
>> > you must make sure device is not changing it.
>> >
>> > Or maybe the device has some kind of ring flush operation,
>> > if there was a reasonably portable way to do this
>> > (e.g. a flush capability could maybe be added to SRIOV)
>> > then hypervisor could do this.
>>
>> This is where things start to get messy. I was suggesting the
>> suspend/resume to resolve this bit, but it might be possible to also
>> deal with this via something like this via clearing the bus master
>> enable bit for the VF. If I am not mistaken that should disable MSI-X
>> interrupts and halt any DMA. That should work as long as you have
>> some mechanism that is tracking the pages in use for DMA.
>
> A bigger issue is recovering afterwards.
Agreed.
>> > In case you need to resume on source, you
>> > really need to follow the same path
>> > as on destination, preferably detecting
>> > device reset and restoring the device
>> > state.
>>
>> The problem with detecting the reset is that you would likely have to
>> be polling to do something like that.
>
> We could some event to guest to notify it about this event
> through a new or existing channel.
>
> Or we could make it possible for userspace to trigger this,
> then notify guest through the guest agent.
The first thing that comes to mind would be to use something like PCIe
Advanced Error Reporting, however I don't know if we can put a
requirement on the system supporting the q35 machine type or not in
order to support migration.
>> I believe the fm10k driver
>> already has code like that in place where it will detect a reset as a
>> part of its watchdog, however the response time is something like 2
>> seconds for that. That was one of the reasons I preferred something
>> like hot-plug as that should be functioning as soon as the guest is up
>> and it is a mechanism that operates outside of the VF drivers.
>
> That's pretty minor.
> A bigger issue is making sure guest does not crash
> when device is suddenly reset under it's legs.
I know the ixgbevf driver should already have logic to address some of
that. If you look through the code there should be logic there to
support surprise removal in ixgbevf. The only issue is that unlike
fm10k it will not restore itself after a resume or slot_reset call.
On Tue, Dec 01, 2015 at 10:36:33AM -0800, Alexander Duyck wrote:
> On Tue, Dec 1, 2015 at 9:37 AM, Michael S. Tsirkin <[email protected]> wrote:
> > On Tue, Dec 01, 2015 at 09:04:32AM -0800, Alexander Duyck wrote:
> >> On Tue, Dec 1, 2015 at 7:28 AM, Michael S. Tsirkin <[email protected]> wrote:
>
> >> > There are several components to this:
> >> > - dma_map_* needs to prevent page from
> >> > being migrated while device is running.
> >> > For example, expose some kind of bitmap from guest
> >> > to host, set bit there while page is mapped.
> >> > What happens if we stop the guest and some
> >> > bits are still set? See dma_alloc_coherent below
> >> > for some ideas.
> >>
> >> Yeah, I could see something like this working. Maybe we could do
> >> something like what was done for the NX bit and make use of the upper
> >> order bits beyond the limits of the memory range to mark pages as
> >> non-migratable?
> >>
> >> I'm curious. What we have with a DMA mapped region is essentially
> >> shared memory between the guest and the device. How would we resolve
> >> something like this with IVSHMEM, or are we blocked there as well in
> >> terms of migration?
> >
> > I have some ideas. Will post later.
>
> I look forward to it.
>
> >> > - dma_unmap_* needs to mark page as dirty
> >> > This can be done by writing into a page.
> >> >
> >> > - dma_sync_* needs to mark page as dirty
> >> > This is trickier as we can not change the data.
> >> > One solution is using atomics.
> >> > For example:
> >> > int x = ACCESS_ONCE(*p);
> >> > cmpxchg(p, x, x);
> >> > Seems to do a write without changing page
> >> > contents.
> >>
> >> Like I said we can probably kill 2 birds with one stone by just
> >> implementing our own dma_mark_clean() for x86 virtualized
> >> environments.
> >>
> >> I'd say we could take your solution one step further and just use 0
> >> instead of bothering to read the value. After all it won't write the
> >> area if the value at the offset is not 0.
> >
> > Really almost any atomic that has no side effect will do.
> > atomic or with 0
> > atomic and with ffffffff
> >
> > It's just that cmpxchg already happens to have a portable
> > wrapper.
>
> I was originally thinking maybe an atomic_add with 0 would be the way
> to go.
cmpxchg with any value too.
> Either way though we still are using a locked prefix and
> having to dirty a cache line per page which is going to come at some
> cost.
I agree. It's likely not necessary for everyone
to be doing this: only people that both
run within the VM and want migration to work
need to do this logging.
So set some module option to have the driver tell the hypervisor that it
supports logging. If bus mastering is enabled before this, migration is
blocked. Or even pass some flag from the hypervisor so the
driver can detect it needs to log writes.
I guess this could be put in device config somewhere,
though in practice it's a global thing, not a per-device one, so
maybe we need some new channel to
pass this flag to the guest. CPUID?
Or maybe we can put some kind of agent in the initrd
and use the existing guest agent channel after all.
An agent in the initrd could open up a lot of new possibilities.
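A sketch of the module-option half of this (the parameter name is made up;
how the flag reaches the hypervisor is left open):

#include <linux/module.h>
#include <linux/moduleparam.h>

/*
 * Set when the VM is expected to be live migrated: the driver then
 * takes the write-logging path.  How the hypervisor learns about it
 * (sysfs attribute, guest agent, CPUID leaf, device config) is a
 * separate question.
 */
static bool dma_dirty_log;
module_param(dma_dirty_log, bool, 0444);
MODULE_PARM_DESC(dma_dirty_log,
		 "Log DMA writes so the hypervisor can live migrate this VM");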
> >> > - dma_alloc_coherent memory (e.g. device rings)
> >> > must be migrated after device stopped modifying it.
> >> > Just stopping the VCPU is not enough:
> >> > you must make sure device is not changing it.
> >> >
> >> > Or maybe the device has some kind of ring flush operation,
> >> > if there was a reasonably portable way to do this
> >> > (e.g. a flush capability could maybe be added to SRIOV)
> >> > then hypervisor could do this.
> >>
> >> This is where things start to get messy. I was suggesting the
> >> suspend/resume to resolve this bit, but it might be possible to also
> >> deal with this via something like this via clearing the bus master
> >> enable bit for the VF. If I am not mistaken that should disable MSI-X
> >> interrupts and halt any DMA. That should work as long as you have
> >> some mechanism that is tracking the pages in use for DMA.
> >
> > A bigger issue is recovering afterwards.
>
> Agreed.
>
> >> > In case you need to resume on source, you
> >> > really need to follow the same path
> >> > as on destination, preferably detecting
> >> > device reset and restoring the device
> >> > state.
> >>
> >> The problem with detecting the reset is that you would likely have to
> >> be polling to do something like that.
> >
> > We could some event to guest to notify it about this event
> > through a new or existing channel.
> >
> > Or we could make it possible for userspace to trigger this,
> > then notify guest through the guest agent.
>
> The first thing that comes to mind would be to use something like PCIe
> Advanced Error Reporting, however I don't know if we can put a
> requirement on the system supporting the q35 machine type or not in
> order to support migration.
You mean require pci express? This sounds quite reasonable.
> >> I believe the fm10k driver
> >> already has code like that in place where it will detect a reset as a
> >> part of its watchdog, however the response time is something like 2
> >> seconds for that. That was one of the reasons I preferred something
> >> like hot-plug as that should be functioning as soon as the guest is up
> >> and it is a mechanism that operates outside of the VF drivers.
> >
> > That's pretty minor.
> > A bigger issue is making sure guest does not crash
> > when device is suddenly reset under it's legs.
>
> I know the ixgbevf driver should already have logic to address some of
> that. If you look through the code there should be logic there for a
> surprise removal support in ixgbevf. The only issue is that unlike
> fm10k it will not restore itself after a resume or slot_reset call.
So if it's just a question of the driver installing a slot_reset handler,
this sounds quite reasonable.
It would be nice to be able to detect that the guest supports
this removal, too, and block migration if it doesn't.
For example, show this capability
in a sysfs attribute and have the guest agent read it.
--
MST
Hi Michael & Alexander:
Thanks a lot for your comments and suggestions.
We still need to support Windows guests for migration, and this is why our
patches keep all changes in the driver, since it's impossible to change the
Windows kernel.
The following is my idea for doing DMA tracking.
Inject an event to the VF driver after the memory iteration stage
and before stopping the VCPU, and then the VF driver marks dirty all
the DMA memory it is using. Newly allocated pages also need to
be marked dirty before stopping the VCPU. All memory dirtied
in this time slot will be migrated during the stop-and-copy
stage. We also need to make sure to disable the VF by clearing its
bus master enable bit before migrating this memory.
The DMA pages allocated by the VF driver also need to reserve space
for the dummy write.
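As a rough sketch of the "mark all in-use DMA memory dirty" pass over a
receive ring (the structures below are simplified stand-ins, not the real
ixgbevf layout, and each buffer is assumed to fit within a single page):

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/compiler.h>

/* Simplified stand-ins for the driver's ring bookkeeping. */
struct vf_rx_buffer {
	void *vaddr;			/* CPU address of the buffer */
};

struct vf_rx_ring {
	unsigned int count;
	struct vf_rx_buffer *buffer_info;
};

/* Dirty one buffer without changing its contents. */
static void vf_mark_buffer_dirty(void *addr)
{
	unsigned long *p = addr;
	unsigned long x = ACCESS_ONCE(*p);

	cmpxchg(p, x, x);
}

/*
 * Called from the migration notification path, after bus mastering has
 * been cleared, so the device can no longer write these buffers while
 * the hypervisor copies them.  One write per buffer is enough here
 * because each buffer is assumed to fit within a single page.
 */
static void vf_mark_rx_ring_dirty(struct vf_rx_ring *ring)
{
	unsigned int i;

	for (i = 0; i < ring->count; i++)
		vf_mark_buffer_dirty(ring->buffer_info[i].vaddr);
}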
On 12/2/2015 7:44 PM, Michael S. Tsirkin wrote:
> On Tue, Dec 01, 2015 at 10:36:33AM -0800, Alexander Duyck wrote:
>> On Tue, Dec 1, 2015 at 9:37 AM, Michael S. Tsirkin <[email protected]> wrote:
>>> On Tue, Dec 01, 2015 at 09:04:32AM -0800, Alexander Duyck wrote:
>>>> On Tue, Dec 1, 2015 at 7:28 AM, Michael S. Tsirkin <[email protected]> wrote:
>>
>>>>> There are several components to this:
>>>>> - dma_map_* needs to prevent page from
>>>>> being migrated while device is running.
>>>>> For example, expose some kind of bitmap from guest
>>>>> to host, set bit there while page is mapped.
>>>>> What happens if we stop the guest and some
>>>>> bits are still set? See dma_alloc_coherent below
>>>>> for some ideas.
>>>>
>>>> Yeah, I could see something like this working. Maybe we could do
>>>> something like what was done for the NX bit and make use of the upper
>>>> order bits beyond the limits of the memory range to mark pages as
>>>> non-migratable?
>>>>
>>>> I'm curious. What we have with a DMA mapped region is essentially
>>>> shared memory between the guest and the device. How would we resolve
>>>> something like this with IVSHMEM, or are we blocked there as well in
>>>> terms of migration?
>>>
>>> I have some ideas. Will post later.
>>
>> I look forward to it.
>>
>>>>> - dma_unmap_* needs to mark page as dirty
>>>>> This can be done by writing into a page.
>>>>>
>>>>> - dma_sync_* needs to mark page as dirty
>>>>> This is trickier as we can not change the data.
>>>>> One solution is using atomics.
>>>>> For example:
>>>>> int x = ACCESS_ONCE(*p);
>>>>> cmpxchg(p, x, x);
>>>>> Seems to do a write without changing page
>>>>> contents.
>>>>
>>>> Like I said we can probably kill 2 birds with one stone by just
>>>> implementing our own dma_mark_clean() for x86 virtualized
>>>> environments.
>>>>
>>>> I'd say we could take your solution one step further and just use 0
>>>> instead of bothering to read the value. After all it won't write the
>>>> area if the value at the offset is not 0.
>>>
>>> Really almost any atomic that has no side effect will do.
>>> atomic or with 0
>>> atomic and with ffffffff
>>>
>>> It's just that cmpxchg already happens to have a portable
>>> wrapper.
>>
>> I was originally thinking maybe an atomic_add with 0 would be the way
>> to go.
>
> cmpxchg with any value too.
>
>> Either way though we still are using a locked prefix and
>> having to dirty a cache line per page which is going to come at some
>> cost.
>
> I agree. It's likely not necessary for everyone
> to be doing this: only people that both
> run within the VM and want migration to work
> need to do this logging.
>
> So set some module option to have driver tell hypervisor that it
> supports logging. If bus mastering is enabled before this, migration is
> blocked. Or even pass some flag from hypervisor so
> driver can detect it needs to log writes.
> I guess this could be put in device config somewhere,
> though in practice it's a global thing, not a per device one, so
> maybe we need some new channel to
> pass this flag to guest. CPUID?
> Or maybe we can put some kind of agent in the initrd
> and use the existing guest agent channel after all.
> agent in initrd could open up a lot of new possibilities.
>
>
>>>>> - dma_alloc_coherent memory (e.g. device rings)
>>>>> must be migrated after device stopped modifying it.
>>>>> Just stopping the VCPU is not enough:
>>>>> you must make sure device is not changing it.
>>>>>
>>>>> Or maybe the device has some kind of ring flush operation,
>>>>> if there was a reasonably portable way to do this
>>>>> (e.g. a flush capability could maybe be added to SRIOV)
>>>>> then hypervisor could do this.
>>>>
>>>> This is where things start to get messy. I was suggesting the
>>>> suspend/resume to resolve this bit, but it might be possible to also
>>>> deal with this via something like this via clearing the bus master
>>>> enable bit for the VF. If I am not mistaken that should disable MSI-X
>>>> interrupts and halt any DMA. That should work as long as you have
>>>> some mechanism that is tracking the pages in use for DMA.
>>>
>>> A bigger issue is recovering afterwards.
>>
>> Agreed.
>>
>>>>> In case you need to resume on source, you
>>>>> really need to follow the same path
>>>>> as on destination, preferably detecting
>>>>> device reset and restoring the device
>>>>> state.
>>>>
>>>> The problem with detecting the reset is that you would likely have to
>>>> be polling to do something like that.
>>>
>>> We could some event to guest to notify it about this event
>>> through a new or existing channel.
>>>
>>> Or we could make it possible for userspace to trigger this,
>>> then notify guest through the guest agent.
>>
>> The first thing that comes to mind would be to use something like PCIe
>> Advanced Error Reporting, however I don't know if we can put a
>> requirement on the system supporting the q35 machine type or not in
>> order to support migration.
>
> You mean require pci express? This sounds quite reasonable.
>
>>>> I believe the fm10k driver
>>>> already has code like that in place where it will detect a reset as a
>>>> part of its watchdog, however the response time is something like 2
>>>> seconds for that. That was one of the reasons I preferred something
>>>> like hot-plug as that should be functioning as soon as the guest is up
>>>> and it is a mechanism that operates outside of the VF drivers.
>>>
>>> That's pretty minor.
>>> A bigger issue is making sure guest does not crash
>>> when device is suddenly reset under it's legs.
>>
>> I know the ixgbevf driver should already have logic to address some of
>> that. If you look through the code there should be logic there for a
>> surprise removal support in ixgbevf. The only issue is that unlike
>> fm10k it will not restore itself after a resume or slot_reset call.
>
> So if it's the question of driver installing a slot_reset handler, this
> sounds quite reasonable.
>
> It would be nice to be able to detect that guest supports
> this removal, too, and block migration if it doesn't.
> For example, show this capability
> in an attribute in sysfs, make guest agent read that.
>
On 12/04/2015 08:32 AM, Lan, Tianyu wrote:
> Hi Michael & Alexander:
> Thanks a lot for your comments and suggestions.
>
> We still need to support Windows guest for migration and this is why our
> patches keep all changes in the driver since it's impossible to change
> Windows kernel.
That is a poor argument. I highly doubt Microsoft is interested in
having to modify all of the drivers that will support direct assignment
in order to support migration. They would likely request something
similar to what I have in that they will want a way to do DMA tracking
with minimal modification required to the drivers.
> Following is my idea to do DMA tracking.
>
> Inject event to VF driver after memory iterate stage
> and before stop VCPU and then VF driver marks dirty all
> using DMA memory. The new allocated pages also need to
> be marked dirty before stopping VCPU. All dirty memory
> in this time slot will be migrated until stop-and-copy
> stage. We also need to make sure to disable VF via clearing the
> bus master enable bit for VF before migrating these memory.
The ordering of your explanation here doesn't quite work. What needs to
happen is that you have to disable DMA and then mark the pages as dirty.
What the disabling of the BME does is signal to the hypervisor that
the device is now stopped. The ixgbevf_suspend call already supported
by the driver is almost exactly what is needed to take care of something
like this.
The question is how we would go about triggering it. I really don't
think the PCI configuration space approach is the right idea. I wonder
if we couldn't get away with some sort of ACPI event instead. We
already require ACPI support in order to shut down the system
gracefully; I wonder if we couldn't get away with something similar in
order to suspend/resume the direct-assigned devices gracefully.
> The dma page allocated by VF driver also needs to reserve space
> to do dummy write.
No, this will not work. If for example you have a VF driver allocating
memory for a 9K receive how will that work? It isn't as if you can poke
a hole in the contiguous memory.
On 12/5/2015 1:07 AM, Alexander Duyck wrote:
>>
>> We still need to support Windows guest for migration and this is why our
>> patches keep all changes in the driver since it's impossible to change
>> Windows kernel.
>
> That is a poor argument. I highly doubt Microsoft is interested in
> having to modify all of the drivers that will support direct assignment
> in order to support migration. They would likely request something
> similar to what I have in that they will want a way to do DMA tracking
> with minimal modification required to the drivers.
This totally depends on the NIC or other device vendors, and they
should make the decision whether to support migration or not. If yes, they
would modify the driver.
If the only target is to call suspend/resume during migration, the feature
will be meaningless. Most use cases don't want the user to be affected much
during migration, so the service downtime is vital. Our target is to apply
SRIOV NIC passthrough to cloud service and NFV (network functions
virtualization) projects, which are sensitive to network performance
and stability. In my opinion, we should give the device driver a chance
to implement its own migration job, and call the suspend and resume
callbacks in the driver if it doesn't care about performance during migration.
>
>> Following is my idea to do DMA tracking.
>>
>> Inject event to VF driver after memory iterate stage
>> and before stop VCPU and then VF driver marks dirty all
>> using DMA memory. The new allocated pages also need to
>> be marked dirty before stopping VCPU. All dirty memory
>> in this time slot will be migrated until stop-and-copy
>> stage. We also need to make sure to disable VF via clearing the
>> bus master enable bit for VF before migrating these memory.
>
> The ordering of your explanation here doesn't quite work. What needs to
> happen is that you have to disable DMA and then mark the pages as dirty.
> What the disabling of the BME does is signal to the hypervisor that
> the device is now stopped. The ixgbevf_suspend call already supported
> by the driver is almost exactly what is needed to take care of something
> like this.
This is why I hope to reserve a piece of space in the DMA page to do the
dummy write. This can help to mark the page dirty while not requiring DMA
to stop and not racing with the DMA data.
If we can't do that, we have to stop DMA for a short time to mark all DMA
pages dirty and then re-enable it. I am not sure how much we can gain by
tracking all DMA memory this way with the device running during migration. I
need to do some tests and compare the results with stopping DMA directly at
the last stage of migration.
>
> The question is how we would go about triggering it. I really don't
> think the PCI configuration space approach is the right idea.
> I wonder
> if we couldn't get away with some sort of ACPI event instead. We
> already require ACPI support in order to shut down the system
> gracefully, I wonder if we couldn't get away with something similar in
> order to suspend/resume the direct assigned devices gracefully.
>
I don't think there are such events in the current spec.
Also, there are two kinds of suspend/resume callbacks:
1) System suspend/resume, called during S2RAM and S2DISK.
2) Runtime suspend/resume, called by the PM core when the device is idle.
If you want to do what you mentioned, you have to change the PM core and
the ACPI spec.
>> The dma page allocated by VF driver also needs to reserve space
>> to do dummy write.
>
> No, this will not work. If for example you have a VF driver allocating
> memory for a 9K receive how will that work? It isn't as if you can poke
> a hole in the contiguous memory.
On Mon, Dec 7, 2015 at 7:40 AM, Lan, Tianyu <[email protected]> wrote:
> On 12/5/2015 1:07 AM, Alexander Duyck wrote:
>>>
>>>
>>> We still need to support Windows guest for migration and this is why our
>>> patches keep all changes in the driver since it's impossible to change
>>> Windows kernel.
>>
>>
>> That is a poor argument. I highly doubt Microsoft is interested in
>> having to modify all of the drivers that will support direct assignment
>> in order to support migration. They would likely request something
>> similar to what I have in that they will want a way to do DMA tracking
>> with minimal modification required to the drivers.
>
>
> This totally depends on the NIC or other devices' vendors and they
> should make decision to support migration or not. If yes, they would
> modify driver.
Having to modify every driver that wants to support live migration is
a bit much. In addition I don't see this being limited only to NIC
devices. You can direct assign a number of different devices, your
solution cannot be specific to NICs.
> If just target to call suspend/resume during migration, the feature will
> be meaningless. Most cases don't want to affect user during migration
> a lot and so the service down time is vital. Our target is to apply
> SRIOV NIC passthough to cloud service and NFV(network functions
> virtualization) projects which are sensitive to network performance
> and stability. From my opinion, We should give a change for device
> driver to implement itself migration job. Call suspend and resume
> callback in the driver if it doesn't care the performance during migration.
The suspend/resume callback should be efficient in terms of time.
After all we don't want the system to stall for a long period of time
when it should be either running or asleep. Having it burn cycles in
a power state limbo doesn't do anyone any good. If nothing else maybe
it will help to push the vendors to speed up those functions which
then benefit migration and the system sleep states.
Also you keep assuming you can keep the device running while you do
the migration and you can't. You are going to corrupt the memory if
you do, and you have yet to provide any means to explain how you are
going to solve that.
>
>>
>>> Following is my idea to do DMA tracking.
>>>
>>> Inject event to VF driver after memory iterate stage
>>> and before stop VCPU and then VF driver marks dirty all
>>> using DMA memory. The new allocated pages also need to
>>> be marked dirty before stopping VCPU. All dirty memory
>>> in this time slot will be migrated until stop-and-copy
>>> stage. We also need to make sure to disable VF via clearing the
>>> bus master enable bit for VF before migrating these memory.
>>
>>
>> The ordering of your explanation here doesn't quite work. What needs to
>> happen is that you have to disable DMA and then mark the pages as dirty.
>> What the disabling of the BME does is signal to the hypervisor that
>> the device is now stopped. The ixgbevf_suspend call already supported
>> by the driver is almost exactly what is needed to take care of something
>> like this.
>
>
> This is why I hope to reserve a piece of space in the dma page to do dummy
> write. This can help to mark page dirty while not require to stop DMA and
> not race with DMA data.
You can't and it will still race. What concerns me is that your
patches and the document you referenced earlier show a considerable
lack of understanding about how DMA and device drivers work. There is
a reason why device drivers have so many memory barriers and the like
in them. The fact is when you have a CPU and a device both accessing
memory things have to be done in a very specific order and you cannot
violate that.
If you have a contiguous block of memory you expect the device to
write into you cannot just poke a hole in it. Such a situation is not
supported by any hardware that I am aware of.
As far as writing to dirty the pages it only works so long as you halt
the DMA and then mark the pages dirty. It has to be in that order.
Any other order will result in data corruption and I am sure the NFV
customers definitely don't want that.
> If can't do that, we have to stop DMA in a short time to mark all dma
> pages dirty and then reenable it. I am not sure how much we can get by
> this way to track all DMA memory with device running during migration. I
> need to do some tests and compare results with stop DMA diretly at last
> stage during migration.
We have to halt the DMA before we can complete the migration. So
please feel free to test this.
In addition I still feel you would be better off taking this in
smaller steps. I still say your first step would be to come up with a
generic solution for the dirty page tracking like the dma_mark_clean()
approach I had mentioned earlier. If I get time I might try to take
care of it myself later this week since you don't seem to agree with
that approach.
>>
>> The question is how we would go about triggering it. I really don't
>> think the PCI configuration space approach is the right idea.
>> I wonder
>> if we couldn't get away with some sort of ACPI event instead. We
>> already require ACPI support in order to shut down the system
>> gracefully, I wonder if we couldn't get away with something similar in
>> order to suspend/resume the direct assigned devices gracefully.
>>
>
> I don't think there is such events in the current spec.
> Otherwise, There are two kinds of suspend/resume callbacks.
> 1) System suspend/resume called during S2RAM and S2DISK.
> 2) Runtime suspend/resume called by pm core when device is idle.
> If you want to do what you mentioned, you have to change PM core and
> ACPI spec.
The thought I had was to somehow try to move the direct assigned
devices into their own power domain and then simulate an AC power event
where that domain is switched off. However I don't know if there are
ACPI events to support that since the power domain code currently only
appears to be in use for runtime power management.
That had also given me the thought to look at something like runtime
power management for the VFs. We would need to do a runtime
suspend/resume. The only problem is I don't know if there is any way
to get the VFs to do a quick wakeup. It might be worthwhile looking
at trying to check with the ACPI experts out there to see if there is
anything we can do as bypassing having to use the configuration space
mechanism to signal this would definitely be worth it.
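If the runtime PM route were explored, forcing a suspend/resume cycle
around the migration window could look roughly like this (the wrapper
names are made up; how quickly the VF can wake up is exactly the open
question):

#include <linux/device.h>
#include <linux/pm_runtime.h>

/*
 * Reuse the existing PM paths instead of inventing a new config-space
 * mechanism: force the VF driver through its suspend/resume callbacks
 * around the migration window.
 */
static int vf_migration_pause(struct device *dev)
{
	return pm_runtime_force_suspend(dev);
}

static int vf_migration_resume(struct device *dev)
{
	return pm_runtime_force_resume(dev);
}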
>>> The dma page allocated by VF driver also needs to reserve space
>>> to do dummy write.
>>
>>
>> No, this will not work. If for example you have a VF driver allocating
>> memory for a 9K receive how will that work? It isn't as if you can poke
>> a hole in the contiguous memory.
This is the bit that makes your "poke a hole" solution not portable to
other drivers. I don't know if you overlooked it but for many NICs
jumbo frames means using large memory allocations to receive the data.
That is the way ixgbevf was up until about a year ago so you cannot
expect all the drivers that will want migration support to allow a
space for you to write to. In addition some storage drivers have to
map an entire page, that means there is no room for a hole there.
- Alex
On Mon, Dec 07, 2015 at 09:12:08AM -0800, Alexander Duyck wrote:
> On Mon, Dec 7, 2015 at 7:40 AM, Lan, Tianyu <[email protected]> wrote:
> > On 12/5/2015 1:07 AM, Alexander Duyck wrote:
> >>>
> >>>
> >>> We still need to support Windows guest for migration and this is why our
> >>> patches keep all changes in the driver since it's impossible to change
> >>> Windows kernel.
> >>
> >>
> >> That is a poor argument. I highly doubt Microsoft is interested in
> >> having to modify all of the drivers that will support direct assignment
> >> in order to support migration. They would likely request something
> >> similar to what I have in that they will want a way to do DMA tracking
> >> with minimal modification required to the drivers.
> >
> >
> > This totally depends on the NIC or other devices' vendors and they
> > should make decision to support migration or not. If yes, they would
> > modify driver.
>
> Having to modify every driver that wants to support live migration is
> a bit much. In addition I don't see this being limited only to NIC
> devices. You can direct assign a number of different devices, your
> solution cannot be specific to NICs.
>
> > If just target to call suspend/resume during migration, the feature will
> > be meaningless. Most cases don't want to affect user during migration
> > a lot and so the service down time is vital. Our target is to apply
> > SRIOV NIC passthough to cloud service and NFV(network functions
> > virtualization) projects which are sensitive to network performance
> > and stability. From my opinion, We should give a change for device
> > driver to implement itself migration job. Call suspend and resume
> > callback in the driver if it doesn't care the performance during migration.
>
> The suspend/resume callback should be efficient in terms of time.
> After all we don't want the system to stall for a long period of time
> when it should be either running or asleep. Having it burn cycles in
> a power state limbo doesn't do anyone any good. If nothing else maybe
> it will help to push the vendors to speed up those functions which
> then benefit migration and the system sleep states.
>
> Also you keep assuming you can keep the device running while you do
> the migration and you can't. You are going to corrupt the memory if
> you do, and you have yet to provide any means to explain how you are
> going to solve that.
>
>
> >
> >>
> >>> Following is my idea to do DMA tracking.
> >>>
> >>> Inject event to VF driver after memory iterate stage
> >>> and before stop VCPU and then VF driver marks dirty all
> >>> using DMA memory. The new allocated pages also need to
> >>> be marked dirty before stopping VCPU. All dirty memory
> >>> in this time slot will be migrated until stop-and-copy
> >>> stage. We also need to make sure to disable VF via clearing the
> >>> bus master enable bit for VF before migrating these memory.
> >>
> >>
> >> The ordering of your explanation here doesn't quite work. What needs to
> >> happen is that you have to disable DMA and then mark the pages as dirty.
> >> What the disabling of the BME does is signal to the hypervisor that
> >> the device is now stopped. The ixgbevf_suspend call already supported
> >> by the driver is almost exactly what is needed to take care of something
> >> like this.
> >
> >
> > This is why I hope to reserve a piece of space in the dma page to do dummy
> > write. This can help to mark page dirty while not require to stop DMA and
> > not race with DMA data.
>
> You can't and it will still race. What concerns me is that your
> patches and the document you referenced earlier show a considerable
> lack of understanding about how DMA and device drivers work. There is
> a reason why device drivers have so many memory barriers and the like
> in them. The fact is when you have CPU and a device both accessing
> memory things have to be done in a very specific order and you cannot
> violate that.
>
> If you have a contiguous block of memory you expect the device to
> write into you cannot just poke a hole in it. Such a situation is not
> supported by any hardware that I am aware of.
>
> As far as writing to dirty the pages it only works so long as you halt
> the DMA and then mark the pages dirty. It has to be in that order.
> Any other order will result in data corruption and I am sure the NFV
> customers definitely don't want that.
>
> > If can't do that, we have to stop DMA in a short time to mark all dma
> > pages dirty and then reenable it. I am not sure how much we can get by
> > this way to track all DMA memory with device running during migration. I
> > need to do some tests and compare results with stop DMA diretly at last
> > stage during migration.
>
> We have to halt the DMA before we can complete the migration. So
> please feel free to test this.
>
> In addition I still feel you would be better off taking this in
> smaller steps. I still say your first step would be to come up with a
> generic solution for the dirty page tracking like the dma_mark_clean()
> approach I had mentioned earlier. If I get time I might try to take
> care of it myself later this week since you don't seem to agree with
> that approach.
Or even try to look at the dirty bit in the VT-D PTEs
on the host. See the mail I have just sent.
Might be slower, or might be faster, but is completely
transparent.
> >>
> >> The question is how we would go about triggering it. I really don't
> >> think the PCI configuration space approach is the right idea.
> >> I wonder
> >> if we couldn't get away with some sort of ACPI event instead. We
> >> already require ACPI support in order to shut down the system
> >> gracefully, I wonder if we couldn't get away with something similar in
> >> order to suspend/resume the direct assigned devices gracefully.
> >>
> >
> > I don't think there is such events in the current spec.
> > Otherwise, There are two kinds of suspend/resume callbacks.
> > 1) System suspend/resume called during S2RAM and S2DISK.
> > 2) Runtime suspend/resume called by pm core when device is idle.
> > If you want to do what you mentioned, you have to change PM core and
> > ACPI spec.
>
> The thought I had was to somehow try to move the direct assigned
> devices into their own power domain and then simulate a AC power event
> where that domain is switched off. However I don't know if there are
> ACPI events to support that since the power domain code currently only
> appears to be in use for runtime power management.
>
> That had also given me the thought to look at something like runtime
> power management for the VFs. We would need to do a runtime
> suspend/resume. The only problem is I don't know if there is any way
> to get the VFs to do a quick wakeup. It might be worthwhile looking
> at trying to check with the ACPI experts out there to see if there is
> anything we can do as bypassing having to use the configuration space
> mechanism to signal this would definitely be worth it.
I don't much like this idea because it relies on the
device being exactly the same across source/destination.
After all, this is always true for suspend/resume.
Most users do not have control over this, and you would
often get slightly different versions of firmware,
etc without noticing.
I think we should first see how far along we can get
by doing a full device reset, and only carrying over
high level state such as IP, MAC, ARP cache etc.
> >>> The dma page allocated by VF driver also needs to reserve space
> >>> to do dummy write.
> >>
> >>
> >> No, this will not work. If for example you have a VF driver allocating
> >> memory for a 9K receive how will that work? It isn't as if you can poke
> >> a hole in the contiguous memory.
>
> This is the bit that makes your "poke a hole" solution not portable to
> other drivers. I don't know if you overlooked it but for many NICs
> jumbo frames means using large memory allocations to receive the data.
> That is the way ixgbevf was up until about a year ago so you cannot
> expect all the drivers that will want migration support to allow a
> space for you to write to. In addition some storage drivers have to
> map an entire page, that means there is no room for a hole there.
>
> - Alex
I think we could start with the atomic idea.
cmpxchg(ptr, X, X)
for any value of X will never corrupt any memory.
Then DMA API could gain a flag that says there actually is a hole to
write into, so you can do
ACCESS_ONCE(*ptr) = 0;
or where there is no concurrent access so you can do
ACCESS_ONCE(*ptr) = ACCESS_ONCE(*ptr);
A driver that sets one of these flags will gain a bit of performance.
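A rough sketch of such a flag-driven choice (the flag names are
hypothetical; only the cmpxchg fallback relies purely on existing
primitives):

#include <linux/bitops.h>
#include <linux/atomic.h>
#include <linux/compiler.h>

/* Hypothetical per-mapping flags a driver could pass to the DMA API. */
#define VF_DMA_HAS_HOLE		BIT(0)	/* a word the device never writes */
#define VF_DMA_NO_CONCURRENT	BIT(1)	/* device known to be quiesced */

static void dma_mark_dirty(void *addr, unsigned long flags)
{
	unsigned long *p = addr;	/* points at the hole, if there is one */

	if (flags & VF_DMA_HAS_HOLE) {
		/* Cheap plain store into the reserved hole. */
		ACCESS_ONCE(*p) = 0;
	} else if (flags & VF_DMA_NO_CONCURRENT) {
		/* Rewrite the value; safe because nothing else writes it. */
		ACCESS_ONCE(*p) = ACCESS_ONCE(*p);
	} else {
		/* Always-safe fallback: cmpxchg never corrupts the data. */
		unsigned long x = ACCESS_ONCE(*p);

		cmpxchg(p, x, x);
	}
}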
--
MST
On Mon, Dec 7, 2015 at 9:39 AM, Michael S. Tsirkin <[email protected]> wrote:
> On Mon, Dec 07, 2015 at 09:12:08AM -0800, Alexander Duyck wrote:
>> On Mon, Dec 7, 2015 at 7:40 AM, Lan, Tianyu <[email protected]> wrote:
>> > On 12/5/2015 1:07 AM, Alexander Duyck wrote:
>> > If can't do that, we have to stop DMA in a short time to mark all dma
>> > pages dirty and then reenable it. I am not sure how much we can get by
>> > this way to track all DMA memory with device running during migration. I
>> > need to do some tests and compare results with stop DMA diretly at last
>> > stage during migration.
>>
>> We have to halt the DMA before we can complete the migration. So
>> please feel free to test this.
>>
>> In addition I still feel you would be better off taking this in
>> smaller steps. I still say your first step would be to come up with a
>> generic solution for the dirty page tracking like the dma_mark_clean()
>> approach I had mentioned earlier. If I get time I might try to take
>> care of it myself later this week since you don't seem to agree with
>> that approach.
>
> Or even try to look at the dirty bit in the VT-D PTEs
> on the host. See the mail I have just sent.
> Might be slower, or might be faster, but is completely
> transparent.
I just saw it and I am looking over the VT-d spec now. It looks like
there might be some performance impact if software is changing the
PTEs, since then the VT-d hardware cannot cache them. I still have to
do some more reading, though, so I can fully understand the impact.
>> >>
>> >> The question is how we would go about triggering it. I really don't
>> >> think the PCI configuration space approach is the right idea.
>> >> I wonder
>> >> if we couldn't get away with some sort of ACPI event instead. We
>> >> already require ACPI support in order to shut down the system
>> >> gracefully, I wonder if we couldn't get away with something similar in
>> >> order to suspend/resume the direct assigned devices gracefully.
>> >>
>> >
>> > I don't think there is such events in the current spec.
>> > Otherwise, There are two kinds of suspend/resume callbacks.
>> > 1) System suspend/resume called during S2RAM and S2DISK.
>> > 2) Runtime suspend/resume called by pm core when device is idle.
>> > If you want to do what you mentioned, you have to change PM core and
>> > ACPI spec.
>>
>> The thought I had was to somehow try to move the direct assigned
>> devices into their own power domain and then simulate a AC power event
>> where that domain is switched off. However I don't know if there are
>> ACPI events to support that since the power domain code currently only
>> appears to be in use for runtime power management.
>>
>> That had also given me the thought to look at something like runtime
>> power management for the VFs. We would need to do a runtime
>> suspend/resume. The only problem is I don't know if there is any way
>> to get the VFs to do a quick wakeup. It might be worthwhile looking
>> at trying to check with the ACPI experts out there to see if there is
>> anything we can do as bypassing having to use the configuration space
>> mechanism to signal this would definitely be worth it.
>
> I don't much like this idea because it relies on the
> device being exactly the same across source/destination.
> After all, this is always true for suspend/resume.
> Most users do not have control over this, and you would
> often get slightly different versions of firmware,
> etc without noticing.
The original code was operating on that assumption as well. That is
kind of why I suggested suspend/resume rather than reinventing the
wheel.
> I think we should first see how far along we can get
> by doing a full device reset, and only carrying over
> high level state such as IP, MAC, ARP cache etc.
One advantage of the suspend/resume approach is that it is compatible
with a full reset. The suspend/resume approach assumes the device
goes through a D0->D3->D0 reset as a part of transitioning between the
system states.
I do admit though that the PCI spec says you aren't supposed to be
hot-swapping devices while the system is in a sleep state so odds are
you would encounter issues if the device changed in any significant
way.
>> >>> The dma page allocated by VF driver also needs to reserve space
>> >>> to do dummy write.
>> >>
>> >>
>> >> No, this will not work. If for example you have a VF driver allocating
>> >> memory for a 9K receive how will that work? It isn't as if you can poke
>> >> a hole in the contiguous memory.
>>
>> This is the bit that makes your "poke a hole" solution not portable to
>> other drivers. I don't know if you overlooked it but for many NICs
>> jumbo frames means using large memory allocations to receive the data.
>> That is the way ixgbevf was up until about a year ago so you cannot
>> expect all the drivers that will want migration support to allow a
>> space for you to write to. In addition some storage drivers have to
>> map an entire page, that means there is no room for a hole there.
>>
>> - Alex
>
> I think we could start with the atomic idea.
> cmpxchg(ptr, X, X)
> for any value of X will never corrupt any memory.
Right, pretty much any atomic operation that does not change the
value will do.
> Then DMA API could gain a flag that says there actually is a hole to
> write into, so you can do
>
> ACCESS_ONCE(*ptr)=0;
>
> or another flag that says there is no concurrent access, so you can do
>
> ACCESS_ONCE(*ptr)=ACCESS_ONCE(*ptr);
>
> A driver that sets one of these flags will gain a bit of performance.
I don't see the memory hole thing working out very well. It isn't
very portable and will just make a mess of things in general. I tend
to prefer the cmpxchg(ptr, 0, 0) approach. Yes, it adds a locked
operation, but the fact is we are probably going to be taking a fairly
heavy hit anyway since the cache line is likely not in the L1 cache.
The part I am wondering about is whether there is some way for us to
switch this on/off. Having to always dirty a cache line in each DMA
page isn't exactly desirable, and obviously we don't need it if we are
not running under KVM/Xen and are not in the middle of a migration.
For tests where you are running just netperf and the like the
performance effect won't even show up; it will increase CPU
utilization by a fraction of a percent. It isn't until you start
focusing on small packets or 40/100Gb/s that something like this
becomes an issue.
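As a sketch of the on/off part, and only a sketch (the flag and helper
below are invented names, not existing kernel symbols), I am picturing
something like:

#include <linux/compiler.h>
#include <linux/atomic.h>
#include <linux/types.h>

static bool migration_dirty_log_on; /* flipped by whatever migration notification we end up with */

static inline void dma_maybe_mark_dirty(unsigned long *ptr)
{
	/* in the common, non-migrating case this is a test and a not-taken branch */
	if (unlikely(READ_ONCE(migration_dirty_log_on)))
		cmpxchg(ptr, 0UL, 0UL); /* data-wise a no-op, but still a write access */
}

A static key would obviously be nicer than a plain bool for the fast
path, but the idea is the same.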
If we can get VT-d on the host to take care of the dirty page tracking
for us then that would likely work out even better because we could
probably batch the accesses. So each time we go out and check the
guest for dirty pages we could do it in two passes, one for the pages
the guest dirtied and then one for the pages the device dirtied.
- Alex
On 12/8/2015 1:12 AM, Alexander Duyck wrote:
> On Mon, Dec 7, 2015 at 7:40 AM, Lan, Tianyu <[email protected]> wrote:
>> On 12/5/2015 1:07 AM, Alexander Duyck wrote:
>>>>
>>>>
>>>> We still need to support Windows guest for migration and this is why our
>>>> patches keep all changes in the driver since it's impossible to change
>>>> Windows kernel.
>>>
>>>
>>> That is a poor argument. I highly doubt Microsoft is interested in
>>> having to modify all of the drivers that will support direct assignment
>>> in order to support migration. They would likely request something
>>> similar to what I have in that they will want a way to do DMA tracking
>>> with minimal modification required to the drivers.
>>
>>
>> This totally depends on the NIC or other devices' vendors and they
>> should make decision to support migration or not. If yes, they would
>> modify driver.
>
> Having to modify every driver that wants to support live migration is
> a bit much. In addition I don't see this being limited only to NIC
> devices. You can direct assign a number of different devices, your
> solution cannot be specific to NICs.
We are also adding such migration support for the QAT device, so our
solution will not be limited to NICs. This is just the beginning.
We can't limit users to Linux guests only, so the migration feature
should work for both Windows and Linux guests.
>
>> If just target to call suspend/resume during migration, the feature will
>> be meaningless. Most cases don't want to affect user during migration
>> a lot and so the service down time is vital. Our target is to apply
>> SRIOV NIC passthough to cloud service and NFV(network functions
>> virtualization) projects which are sensitive to network performance
>> and stability. From my opinion, We should give a change for device
>> driver to implement itself migration job. Call suspend and resume
>> callback in the driver if it doesn't care the performance during migration.
>
> The suspend/resume callback should be efficient in terms of time.
> After all we don't want the system to stall for a long period of time
> when it should be either running or asleep. Having it burn cycles in
> a power state limbo doesn't do anyone any good. If nothing else maybe
> it will help to push the vendors to speed up those functions which
> then benefit migration and the system sleep states.
If we can benefit both migration and suspend, that would be wonderful.
But migration and system PM are still different. For example, the
driver doesn't need to put the device into a deep D-state during
migration (the host can do that after migration), while that is
essential for system sleep. The PCI configuration space and interrupt
configuration are emulated by Qemu, and Qemu can migrate them to the
new machine, so the driver doesn't need to deal with them. So I think
migration still needs a different callback or a different code path
than device suspend/resume.
Another concern is that we would have to rework the PM core or the PCI
bus driver to call suspend/resume for passthrough devices during
migration. This also blocks implementing the feature on Windows.
>
> Also you keep assuming you can keep the device running while you do
> the migration and you can't. You are going to corrupt the memory if
> you do, and you have yet to provide any means to explain how you are
> going to solve that.
The main problem is the DMA tracking issue. I will repost my solution
in a new thread for discussion. If there is no way to mark DMA pages
dirty while DMA is enabled, we have to stop DMA for a short time to do
that at the last stage.
>
>>
>>>
>>>> Following is my idea to do DMA tracking.
>>>>
>>>> Inject event to VF driver after memory iterate stage
>>>> and before stop VCPU and then VF driver marks dirty all
>>>> using DMA memory. The new allocated pages also need to
>>>> be marked dirty before stopping VCPU. All dirty memory
>>>> in this time slot will be migrated until stop-and-copy
>>>> stage. We also need to make sure to disable VF via clearing the
>>>> bus master enable bit for VF before migrating these memory.
>>>
>>>
>>> The ordering of your explanation here doesn't quite work. What needs to
>>> happen is that you have to disable DMA and then mark the pages as dirty.
>>> What the disabling of the BME does is signal to the hypervisor that
>>> the device is now stopped. The ixgbevf_suspend call already supported
>>> by the driver is almost exactly what is needed to take care of something
>>> like this.
>>
>>
>> This is why I hope to reserve a piece of space in the dma page to do dummy
>> write. This can help to mark page dirty while not require to stop DMA and
>> not race with DMA data.
>
> You can't and it will still race. What concerns me is that your
> patches and the document you referenced earlier show a considerable
> lack of understanding about how DMA and device drivers work. There is
> a reason why device drivers have so many memory barriers and the like
> in them. The fact is when you have CPU and a device both accessing
> memory things have to be done in a very specific order and you cannot
> violate that.
>
> If you have a contiguous block of memory you expect the device to
> write into you cannot just poke a hole in it. Such a situation is not
> supported by any hardware that I am aware of.
>
> As far as writing to dirty the pages it only works so long as you halt
> the DMA and then mark the pages dirty. It has to be in that order.
> Any other order will result in data corruption and I am sure the NFV
> customers definitely don't want that.
>
>> If can't do that, we have to stop DMA in a short time to mark all dma
>> pages dirty and then reenable it. I am not sure how much we can get by
>> this way to track all DMA memory with device running during migration. I
>> need to do some tests and compare results with stop DMA diretly at last
>> stage during migration.
>
> We have to halt the DMA before we can complete the migration. So
> please feel free to test this.
If we can inject an interrupt to notify the driver just before stopping
the VCPU and then stop DMA, it will not add much to the service downtime,
since the network will be down anyway once the VCPU is stopped.
So the question becomes how and when to notify the device driver
about the migration status.
>
> In addition I still feel you would be better off taking this in
> smaller steps. I still say your first step would be to come up with a
> generic solution for the dirty page tracking like the dma_mark_clean()
> approach I had mentioned earlier. If I get time I might try to take
> care of it myself later this week since you don't seem to agree with
> that approach.
No, I agree: doing the dummy write in a generic function is a good
idea. This will benefit all passthrough devices. The dummy write is
essential during migration, whether or not DMA is stopped, unless the
hardware supports DMA tracking.
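To show what I mean by the generic path, a minimal sketch (this assumes
the hypothetical dma_mark_dirty() helper discussed earlier in the
thread; it is not an existing kernel function):

#include <linux/mm.h>
#include <linux/dma-direction.h>

static void generic_unmap_mark_dirty(void *vaddr, size_t size,
				     enum dma_data_direction dir)
{
	unsigned long addr = (unsigned long)vaddr & PAGE_MASK;
	unsigned long end = (unsigned long)vaddr + size;

	if (dir == DMA_TO_DEVICE)
		return; /* device never wrote here, nothing to dirty */

	/* one write per page is enough to get the whole page re-migrated */
	for (; addr < end; addr += PAGE_SIZE)
		dma_mark_dirty((unsigned long *)addr, 0);
}

Hooking something like this into the common unmap/sync path would keep
the per-driver changes small.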
>
>>>
>>> The question is how we would go about triggering it. I really don't
>>> think the PCI configuration space approach is the right idea.
>>> I wonder
>>> if we couldn't get away with some sort of ACPI event instead. We
>>> already require ACPI support in order to shut down the system
>>> gracefully, I wonder if we couldn't get away with something similar in
>>> order to suspend/resume the direct assigned devices gracefully.
>>>
>>
>> I don't think there is such events in the current spec.
>> Otherwise, There are two kinds of suspend/resume callbacks.
>> 1) System suspend/resume called during S2RAM and S2DISK.
>> 2) Runtime suspend/resume called by pm core when device is idle.
>> If you want to do what you mentioned, you have to change PM core and
>> ACPI spec.
>
> The thought I had was to somehow try to move the direct assigned
> devices into their own power domain and then simulate a AC power event
> where that domain is switched off. However I don't know if there are
> ACPI events to support that since the power domain code currently only
> appears to be in use for runtime power management.
This is my concern: how to suspend the passthrough device. The PM
callbacks only work during system PM (S3, S4) and runtime PM. You
would have to add some code in the PM core and the PCI bus driver to do
something like a forced suspend when a migration event arrives.
As far as I know, the GFX device registers a callback on the AC power
event and changes the backlight when AC is plugged or unplugged.
>
> That had also given me the thought to look at something like runtime
> power management for the VFs. We would need to do a runtime
> suspend/resume. The only problem is I don't know if there is any way
> to get the VFs to do a quick wakeup. It might be worthwhile looking
> at trying to check with the ACPI experts out there to see if there is
> anything we can do as bypassing having to use the configuration space
> mechanism to signal this would definitely be worth it.
>
Currently the PCI configuration space is used to share migration status
and device information, and notification is done by injecting the device
irq. If we can't safely find free PCI configuration space, we need to
find some other place to store this info.
If you just need to wake up a PCI device, PME may help.
On Sat, Dec 05, 2015 at 12:32:00AM +0800, Lan, Tianyu wrote:
> Hi Michael & Alexander:
> Thanks a lot for your comments and suggestions.
It's nice that it's appreciated, but you then go on and ignore
all that I have written here:
https://www.mail-archive.com/[email protected]/msg123826.html
> We still need to support Windows guest for migration and this is why our
> patches keep all changes in the driver since it's impossible to change
> Windows kernel.
This is not a reasonable argument. It makes no sense to duplicate code
on Linux because you must duplicate code on Windows. Let's assume you
must do it in the driver on windows because windows has closed source
drivers. What does it matter? Linux can still do it as part of DMA API
and have it apply to all drivers.
> Following is my idea to do DMA tracking.
>
> Inject event to VF driver after memory iterate stage
> and before stop VCPU and then VF driver marks dirty all
> using DMA memory. The new allocated pages also need to
> be marked dirty before stopping VCPU. All dirty memory
> in this time slot will be migrated until stop-and-copy
> stage. We also need to make sure to disable VF via clearing the
> bus master enable bit for VF before migrating these memory.
>
> The dma page allocated by VF driver also needs to reserve space
> to do dummy write.
I suggested ways to do it all in the hypervisor without driver hacks, or
hide it within DMA API without need to reserve extra space. Both
approaches seem much cleaner.
--
MST
On 12/9/2015 6:37 PM, Michael S. Tsirkin wrote:
> On Sat, Dec 05, 2015 at 12:32:00AM +0800, Lan, Tianyu wrote:
>> Hi Michael & Alexander:
>> Thanks a lot for your comments and suggestions.
>
> It's nice that it's appreciated, but you then go on and ignore
> all that I have written here:
> https://www.mail-archive.com/[email protected]/msg123826.html
>
No, I will reply to it separately and, according to your suggestion,
split it into 3 threads.
>> We still need to support Windows guest for migration and this is why our
>> patches keep all changes in the driver since it's impossible to change
>> Windows kernel.
>
> This is not a reasonable argument. It makes no sense to duplicate code
> on Linux because you must duplicate code on Windows. Let's assume you
> must do it in the driver on windows because windows has closed source
> drivers. What does it matter? Linux can still do it as part of DMA API
> and have it apply to all drivers.
>
Sure. Duplicated code should be encapsulated and made reusable by
other drivers, just like the dummy write part you mentioned.
I meant that the framework should not require changing Windows kernel
code (such as the PM core or PCI bus driver), since that would block
the implementation on Windows.
I think it's not a problem to duplicate code in the Windows drivers.
>> Following is my idea to do DMA tracking.
>>
>> Inject event to VF driver after memory iterate stage
>> and before stop VCPU and then VF driver marks dirty all
>> using DMA memory. The new allocated pages also need to
>> be marked dirty before stopping VCPU. All dirty memory
>> in this time slot will be migrated until stop-and-copy
>> stage. We also need to make sure to disable VF via clearing the
>> bus master enable bit for VF before migrating these memory.
>>
>> The dma page allocated by VF driver also needs to reserve space
>> to do dummy write.
>
> I suggested ways to do it all in the hypervisor without driver hacks, or
> hide it within DMA API without need to reserve extra space. Both
> approaches seem much cleaner.
>
This sounds reasonable. We can discuss it in detail in a separate thread.
On Wed, Dec 09, 2015 at 07:19:15PM +0800, Lan, Tianyu wrote:
> On 12/9/2015 6:37 PM, Michael S. Tsirkin wrote:
> >On Sat, Dec 05, 2015 at 12:32:00AM +0800, Lan, Tianyu wrote:
> >>Hi Michael & Alexander:
> >>Thanks a lot for your comments and suggestions.
> >
> >It's nice that it's appreciated, but you then go on and ignore
> >all that I have written here:
> >https://www.mail-archive.com/[email protected]/msg123826.html
> >
>
> No, I will reply to it separately and, according to your suggestion,
> split it into 3 threads.
>
> >>We still need to support Windows guest for migration and this is why our
> >>patches keep all changes in the driver since it's impossible to change
> >>Windows kernel.
> >
> >This is not a reasonable argument. It makes no sense to duplicate code
> >on Linux because you must duplicate code on Windows. Let's assume you
> >must do it in the driver on windows because windows has closed source
> >drivers. What does it matter? Linux can still do it as part of DMA API
> >and have it apply to all drivers.
> >
>
> Sure. Duplicated code should be encapsulated and made reusable by
> other drivers, just like the dummy write part you mentioned.
>
> I meant that the framework should not require changing Windows kernel
> code (such as the PM core or PCI bus driver), since that would block
> the implementation on Windows.
I remember reading that it's possible to implement a bus driver
on Windows if required. But basically I don't see how Windows can be
relevant to discussing guest driver patches. That discussion
probably belongs on the qemu mailing list, not on lkml.
> I think it's not a problem to duplicate code in the Windows drivers.
>
> >>Following is my idea to do DMA tracking.
> >>
> >>Inject event to VF driver after memory iterate stage
> >>and before stop VCPU and then VF driver marks dirty all
> >>using DMA memory. The new allocated pages also need to
> >>be marked dirty before stopping VCPU. All dirty memory
> >>in this time slot will be migrated until stop-and-copy
> >>stage. We also need to make sure to disable VF via clearing the
> >>bus master enable bit for VF before migrating these memory.
> >>
> >>The dma page allocated by VF driver also needs to reserve space
> >>to do dummy write.
> >
> >I suggested ways to do it all in the hypervisor without driver hacks, or
> >hide it within DMA API without need to reserve extra space. Both
> >approaches seem much cleaner.
> >
>
> This sounds reasonable. We can discuss it in detail in a separate thread.
On 12/9/2015 7:28 PM, Michael S. Tsirkin wrote:
> I remember reading that it's possible to implement a bus driver
> on Windows if required. But basically I don't see how Windows can be
> relevant to discussing guest driver patches. That discussion
> probably belongs on the qemu mailing list, not on lkml.
I am not sure whether we can write a bus driver for Windows to support
migration. But I think device vendors who want to support migration will
improve their drivers if we provide such a framework in the hypervisor
that only requires them to change their drivers.
On Wed, Dec 9, 2015 at 1:28 AM, Lan, Tianyu <[email protected]> wrote:
>
>
> On 12/8/2015 1:12 AM, Alexander Duyck wrote:
>>
>> On Mon, Dec 7, 2015 at 7:40 AM, Lan, Tianyu <[email protected]> wrote:
>>>
>>> On 12/5/2015 1:07 AM, Alexander Duyck wrote:
>>>>>
>>>>>
>>>>>
>>>>> We still need to support Windows guest for migration and this is why
>>>>> our
>>>>> patches keep all changes in the driver since it's impossible to change
>>>>> Windows kernel.
>>>>
>>>>
>>>>
>>>> That is a poor argument. I highly doubt Microsoft is interested in
>>>> having to modify all of the drivers that will support direct assignment
>>>> in order to support migration. They would likely request something
>>>> similar to what I have in that they will want a way to do DMA tracking
>>>> with minimal modification required to the drivers.
>>>
>>>
>>>
>>> This totally depends on the NIC or other devices' vendors and they
>>> should make decision to support migration or not. If yes, they would
>>> modify driver.
>>
>>
>> Having to modify every driver that wants to support live migration is
>> a bit much. In addition I don't see this being limited only to NIC
>> devices. You can direct assign a number of different devices, your
>> solution cannot be specific to NICs.
>
>
> We are also adding such migration support for the QAT device, so our
> solution will not be limited to NICs. This is just the beginning.
Agreed, but still QAT is networking related. My advice would be to
look at something else that works from within a different subsystem
such as storage. All I am saying is that your solution is very
networking centric.
> We can't limit users to Linux guests only, so the migration feature
> should work for both Windows and Linux guests.
Right now what your solution is doing is to limit things so that only
the Intel NICs can support this since it will require driver
modification across the board. Instead what I have proposed should
make it so that once you have done the work there should be very
little work that has to be done on your port to support any device.
>>
>>> If just target to call suspend/resume during migration, the feature will
>>> be meaningless. Most cases don't want to affect user during migration
>>> a lot and so the service down time is vital. Our target is to apply
>>> SRIOV NIC passthough to cloud service and NFV(network functions
>>> virtualization) projects which are sensitive to network performance
>>> and stability. From my opinion, We should give a change for device
>>> driver to implement itself migration job. Call suspend and resume
>>> callback in the driver if it doesn't care the performance during
>>> migration.
>>
>>
>> The suspend/resume callback should be efficient in terms of time.
>> After all we don't want the system to stall for a long period of time
>> when it should be either running or asleep. Having it burn cycles in
>> a power state limbo doesn't do anyone any good. If nothing else maybe
>> it will help to push the vendors to speed up those functions which
>> then benefit migration and the system sleep states.
>
>
> If we can benefit both migration and suspend, that would be wonderful.
> But migration and system PM are still different. For example, the
> driver doesn't need to put the device into a deep D-state during
> migration (the host can do that after migration), while that is
> essential for system sleep. The PCI configuration space and interrupt
> configuration are emulated by Qemu, and Qemu can migrate them to the
> new machine, so the driver doesn't need to deal with them. So I think
> migration still needs a different callback or a different code path
> than device suspend/resume.
SR-IOV devices are considered to be in D3 as soon as you clear the bus
master enable bit. They don't actually have a PCIe power management
block in their configuration space. The advantage of the
suspend/resume approach is that the D0->D3->D0 series of transitions
should trigger a PCIe reset on the device. As such the resume call is
capable of fully reinitializing a device.
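Just to illustrate the direction, a rough sketch (this would live in
ixgbevf_main.c where those PM functions are defined and CONFIG_PM is
set, and the migration notification itself is hypothetical at this
point):

static void ixgbevf_migration_notify(struct pci_dev *pdev, bool start)
{
	if (start)
		/* quiesce: stop the queues, free IRQs, clear bus master */
		ixgbevf_suspend(pdev, PMSG_SUSPEND);
	else
		/* on the destination: full reinit, renegotiate with the PF */
		ixgbevf_resume(pdev);
}

The point being that the heavy lifting already exists in the driver's
suspend/resume paths.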
As far as migrating the interrupts themselves, moving live interrupts
is problematic. You are more likely to throw them out of sync, since
the state of the device will not match the state of what you migrated
for things like the pending bit array, so if there is a device that
actually depends on those bits you might run into issues.
> Another concern is that we would have to rework the PM core or the PCI
> bus driver to call suspend/resume for passthrough devices during
> migration. This also blocks implementing the feature on Windows.
If I am not mistaken the Windows drivers have a similar feature that
is called when you disable or enable an interface. I believe the
motivation for using D3 when a device has been disabled is to save
power on the system since in D3 the device should be in its lowest
power state.
>>
>> Also you keep assuming you can keep the device running while you do
>> the migration and you can't. You are going to corrupt the memory if
>> you do, and you have yet to provide any means to explain how you are
>> going to solve that.
>
>
>
> The main problem is the DMA tracking issue. I will repost my solution
> in a new thread for discussion. If there is no way to mark DMA pages
> dirty while DMA is enabled, we have to stop DMA for a short time to do
> that at the last stage.
Correct. We have to stop the device before we lose the ability to
track DMA completions. So once the driver is disabled and has cleared
the mappings only then can we complete the migration.
>>>>> Following is my idea to do DMA tracking.
>>>>>
>>>>> Inject event to VF driver after memory iterate stage
>>>>> and before stop VCPU and then VF driver marks dirty all
>>>>> using DMA memory. The new allocated pages also need to
>>>>> be marked dirty before stopping VCPU. All dirty memory
>>>>> in this time slot will be migrated until stop-and-copy
>>>>> stage. We also need to make sure to disable VF via clearing the
>>>>> bus master enable bit for VF before migrating these memory.
>>>>
>>>>
>>>>
>>>> The ordering of your explanation here doesn't quite work. What needs to
>>>> happen is that you have to disable DMA and then mark the pages as dirty.
>>>> What the disabling of the BME does is signal to the hypervisor that
>>>> the device is now stopped. The ixgbevf_suspend call already supported
>>>> by the driver is almost exactly what is needed to take care of something
>>>> like this.
>>>
>>>
>>>
>>> This is why I hope to reserve a piece of space in the dma page to do
>>> dummy
>>> write. This can help to mark page dirty while not require to stop DMA and
>>> not race with DMA data.
>>
>>
>> You can't and it will still race. What concerns me is that your
>> patches and the document you referenced earlier show a considerable
>> lack of understanding about how DMA and device drivers work. There is
>> a reason why device drivers have so many memory barriers and the like
>> in them. The fact is when you have CPU and a device both accessing
>> memory things have to be done in a very specific order and you cannot
>> violate that.
>>
>> If you have a contiguous block of memory you expect the device to
>> write into you cannot just poke a hole in it. Such a situation is not
>> supported by any hardware that I am aware of.
>>
>> As far as writing to dirty the pages it only works so long as you halt
>> the DMA and then mark the pages dirty. It has to be in that order.
>> Any other order will result in data corruption and I am sure the NFV
>> customers definitely don't want that.
>>
>>> If can't do that, we have to stop DMA in a short time to mark all dma
>>> pages dirty and then reenable it. I am not sure how much we can get by
>>> this way to track all DMA memory with device running during migration. I
>>> need to do some tests and compare results with stop DMA diretly at last
>>> stage during migration.
>>
>>
>> We have to halt the DMA before we can complete the migration. So
>> please feel free to test this.
>
>
> If we can inject an interrupt to notify the driver just before stopping
> the VCPU and then stop DMA, it will not add much to the service downtime,
> since the network will be down anyway once the VCPU is stopped.
The key bit is that you must have proper page tracking. So long as
the DMA is stopped, and then the pages are flagged as dirty it should
be safe. If you just flag the pages as dirty and hope the device is
done you are going to corrupt system memory due to the fact that you
will race between the device and the memory copy routine.
> So the question becomes how and when to notify the device driver about
> the migration status.
The device needs to be notified before the stop and halt, and when you
notify the device it has to disable DMA so that it will quit dirtying
pages so you don't race with the final copy operation.
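In other words the ordering looks roughly like this (only a sketch, and
the helpers named in the comments are placeholders, not real functions):

#include <linux/pci.h>

static void quiesce_vf_for_migration(struct pci_dev *pdev)
{
	/* 1) stop the device from issuing new DMA */
	pci_clear_master(pdev);

	/* 2) wait for in-flight DMA to land (device specific, e.g.
	 *    poll the RX/TX queue disable bits until they report idle) */

	/* 3) only now walk the rings and dirty the pages the device may
	 *    have written, e.g. with a dma_mark_dirty()-style helper */
}

Any other ordering leaves a window where the device can write a page
after it has already been copied.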
>>
>> In addition I still feel you would be better off taking this in
>> smaller steps. I still say your first step would be to come up with a
>> generic solution for the dirty page tracking like the dma_mark_clean()
>> approach I had mentioned earlier. If I get time I might try to take
>> care of it myself later this week since you don't seem to agree with
>> that approach.
>
>
> No, I agree: doing the dummy write in a generic function is a good
> idea. This will benefit all passthrough devices. The dummy write is
> essential during migration, whether or not DMA is stopped, unless the
> hardware supports DMA tracking.
Okay so we are agreed on that.
>>
>>>>
>>>> The question is how we would go about triggering it. I really don't
>>>> think the PCI configuration space approach is the right idea.
>>>> I wonder
>>>> if we couldn't get away with some sort of ACPI event instead. We
>>>> already require ACPI support in order to shut down the system
>>>> gracefully, I wonder if we couldn't get away with something similar in
>>>> order to suspend/resume the direct assigned devices gracefully.
>>>>
>>>
>>> I don't think there is such events in the current spec.
>>> Otherwise, There are two kinds of suspend/resume callbacks.
>>> 1) System suspend/resume called during S2RAM and S2DISK.
>>> 2) Runtime suspend/resume called by pm core when device is idle.
>>> If you want to do what you mentioned, you have to change PM core and
>>> ACPI spec.
>>
>>
>> The thought I had was to somehow try to move the direct assigned
>> devices into their own power domain and then simulate a AC power event
>> where that domain is switched off. However I don't know if there are
>> ACPI events to support that since the power domain code currently only
>> appears to be in use for runtime power management.
>
>
> This is my concern: how to suspend the passthrough device. The PM
> callbacks only work during system PM (S3, S4) and runtime PM. You
> would have to add some code in the PM core and the PCI bus driver to
> do something like a forced suspend when a migration event arrives.
>
> As far as I know, the GFX device registers a callback on the AC power
> event and changes the backlight when AC is plugged or unplugged.
Basically it all comes down to what we want to emulate. In my mind
the way I see this working is that we essentially could think of the
direct-assigned devices existing in a separate power domain contained
in something like an external PCIe enclosure. This means that they
have their own power supply and clocks and operate semi-autonomously
from the rest of the guest. My thought was to try and find out how
external PCIe or thunderbolt enclosures work, but as it turns out most
of them don't support powering down or suspending the external
enclosure while the system is in use. As such it doesn't look like
there are any good examples in the real world of the kind of behavior
we would want to emulate. That pretty much just leaves hot-plug as
the only solution for now.
>>
>> That had also given me the thought to look at something like runtime
>> power management for the VFs. We would need to do a runtime
>> suspend/resume. The only problem is I don't know if there is any way
>> to get the VFs to do a quick wakeup. It might be worthwhile looking
>> at trying to check with the ACPI experts out there to see if there is
>> anything we can do as bypassing having to use the configuration space
>> mechanism to signal this would definitely be worth it.
>>
>
> Currently the PCI configuration space is used to share migration status
> and device information, and notification is done by injecting the device
> irq. If we can't safely find free PCI configuration space, we need to
> find some other place to store this info.
>
> If you just need to wake up a PCI device, PME may help.
Another thing you might want to look at would be to move the
configuration space away from the device and instead place it
somewhere that is a bit more centrally located. For example what
would happen if you were to instead add the functionality to the
downstream ports on the PCI/PCIe components in Qemu? That way you
only have to modify the configuration space on a few emulated devices
instead of having to modify it for every device that could be direct
assigned. In addition then all it requires is modifying the port
drivers to register the hooks and they in turn could call into the
device driver to take care of suspending or resuming the devices
attached to the downstream port.
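As a very rough sketch of the shape I am thinking of (all of the names
here are invented, nothing like this exists today):

#include <linux/pci.h>

/* hooks a downstream port driver could offer for the devices below it */
struct pcie_port_migration_ops {
	int (*freeze)(struct pci_dev *pdev);	/* called before stop-and-copy */
	int (*thaw)(struct pci_dev *pdev);	/* called on the destination */
};

int pcie_port_register_migration_ops(struct pci_dev *downstream_port,
				     const struct pcie_port_migration_ops *ops);

The port driver would then be the one piece that knows about the
emulated configuration space, and it would simply call into the driver
of each device attached below it.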