From: Longfang Liu <[email protected]>
VMs assigned with HiSilicon ACC VF devices can now perform
live migration if the VF devices are bind to the hisi_acc_vfio_pci
driver.
Signed-off-by: Longfang Liu <[email protected]>
Signed-off-by: Shameer Kolothum <[email protected]>
---
drivers/vfio/pci/hisilicon/Kconfig | 7 +
.../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1083 ++++++++++++++++-
.../vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 112 ++
3 files changed, 1184 insertions(+), 18 deletions(-)
create mode 100644 drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
diff --git a/drivers/vfio/pci/hisilicon/Kconfig b/drivers/vfio/pci/hisilicon/Kconfig
index d5acaf74a878..02811364a7a7 100644
--- a/drivers/vfio/pci/hisilicon/Kconfig
+++ b/drivers/vfio/pci/hisilicon/Kconfig
@@ -2,6 +2,13 @@
config HISI_ACC_VFIO_PCI
tristate "VFIO PCI support for HiSilicon ACC devices"
depends on (ARM64 && VFIO_PCI_CORE) || (COMPILE_TEST && 64BIT)
+ depends on PCI && PCI_MSI
+ depends on UACCE || UACCE=n
+ depends on ACPI
+ select CRYPTO_DEV_HISI_QM
+ select CRYPTO_DEV_HISI_HPRE
+ select CRYPTO_DEV_HISI_SEC2
+ select CRYPTO_DEV_HISI_ZIP
help
This provides generic PCI support for HiSilicon ACC devices
using the VFIO framework.
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 582ee4fa4109..ce57c230d1a0 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -12,6 +12,970 @@
#include <linux/pci.h>
#include <linux/vfio.h>
#include <linux/vfio_pci_core.h>
+#include <linux/anon_inodes.h>
+
+#include "hisi_acc_vfio_pci.h"
+
+/* return 0 on VM acc device ready, -ETIMEDOUT hardware timeout */
+static int qm_wait_dev_not_ready(struct hisi_qm *qm)
+{
+ u32 val;
+
+ return readl_relaxed_poll_timeout(qm->io_base + QM_VF_STATE,
+ val, !(val & 0x1), MB_POLL_PERIOD_US,
+ MB_POLL_TIMEOUT_US);
+}
+
+/*
+ * Each state Reg is checked 100 times,
+ * with a delay of 100 microseconds after each check
+ */
+static u32 acc_check_reg_state(struct hisi_qm *qm, u32 regs)
+{
+ int check_times = 0;
+ u32 state;
+
+ state = readl(qm->io_base + regs);
+ while (state && check_times < ERROR_CHECK_TIMEOUT) {
+ udelay(CHECK_DELAY_TIME);
+ state = readl(qm->io_base + regs);
+ check_times++;
+ }
+
+ return state;
+}
+
+/* Check the PF's RAS state and Function INT state */
+static int qm_check_int_state(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+ struct hisi_qm *vfqm = &hisi_acc_vdev->vf_qm;
+ struct hisi_qm *qm = hisi_acc_vdev->pf_qm;
+ struct pci_dev *vf_pdev = hisi_acc_vdev->vf_dev;
+ struct device *dev = &qm->pdev->dev;
+ u32 state;
+
+ /* Check RAS state */
+ state = acc_check_reg_state(qm, QM_ABNORMAL_INT_STATUS);
+ if (state) {
+ dev_err(dev, "failed to check QM RAS state!\n");
+ return -EBUSY;
+ }
+
+ /* Check Function Communication state between PF and VF */
+ state = acc_check_reg_state(vfqm, QM_IFC_INT_STATUS);
+ if (state) {
+ dev_err(dev, "failed to check QM IFC INT state!\n");
+ return -EBUSY;
+ }
+ state = acc_check_reg_state(vfqm, QM_IFC_INT_SET_V);
+ if (state) {
+ dev_err(dev, "failed to check QM IFC INT SET state!\n");
+ return -EBUSY;
+ }
+
+ /* Check submodule task state */
+ switch (vf_pdev->device) {
+ case PCI_DEVICE_ID_HUAWEI_SEC_VF:
+ state = acc_check_reg_state(qm, SEC_CORE_INT_STATUS);
+ if (state) {
+ dev_err(dev, "failed to check QM SEC Core INT state!\n");
+ return -EBUSY;
+ }
+ return 0;
+ case PCI_DEVICE_ID_HUAWEI_HPRE_VF:
+ state = acc_check_reg_state(qm, HPRE_HAC_INT_STATUS);
+ if (state) {
+ dev_err(dev, "failed to check QM HPRE HAC INT state!\n");
+ return -EBUSY;
+ }
+ return 0;
+ case PCI_DEVICE_ID_HUAWEI_ZIP_VF:
+ state = acc_check_reg_state(qm, HZIP_CORE_INT_STATUS);
+ if (state) {
+ dev_err(dev, "failed to check QM ZIP Core INT state!\n");
+ return -EBUSY;
+ }
+ return 0;
+ default:
+ dev_err(dev, "failed to detect acc module type!\n");
+ return -EINVAL;
+ }
+}
+
+static int qm_read_reg(struct hisi_qm *qm, u32 reg_addr,
+ u32 *data, u8 nums)
+{
+ int i;
+
+ if (nums < 1 || nums > QM_REGS_MAX_LEN)
+ return -EINVAL;
+
+ for (i = 0; i < nums; i++) {
+ data[i] = readl(qm->io_base + reg_addr);
+ reg_addr += QM_REG_ADDR_OFFSET;
+ }
+
+ return 0;
+}
+
+static int qm_write_reg(struct hisi_qm *qm, u32 reg,
+ u32 *data, u8 nums)
+{
+ int i;
+
+ if (nums < 1 || nums > QM_REGS_MAX_LEN)
+ return -EINVAL;
+
+ for (i = 0; i < nums; i++)
+ writel(data[i], qm->io_base + reg + i * QM_REG_ADDR_OFFSET);
+
+ return 0;
+}
+
+static int qm_get_vft(struct hisi_qm *qm, u32 *base)
+{
+ u64 sqc_vft;
+ u32 qp_num;
+ int ret;
+
+ ret = qm_mb(qm, QM_MB_CMD_SQC_VFT_V2, 0, 0, 1);
+ if (ret)
+ return ret;
+
+ sqc_vft = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) |
+ ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) <<
+ QM_XQC_ADDR_OFFSET);
+ *base = QM_SQC_VFT_BASE_MASK_V2 & (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2);
+ qp_num = (QM_SQC_VFT_NUM_MASK_V2 &
+ (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1;
+
+ return qp_num;
+}
+
+static int qm_get_sqc(struct hisi_qm *qm, u64 *addr)
+{
+ int ret;
+
+ ret = qm_mb(qm, QM_MB_CMD_SQC_BT, 0, 0, 1);
+ if (ret)
+ return ret;
+
+ *addr = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) |
+ ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) <<
+ QM_XQC_ADDR_OFFSET);
+
+ return 0;
+}
+
+static int qm_get_cqc(struct hisi_qm *qm, u64 *addr)
+{
+ int ret;
+
+ ret = qm_mb(qm, QM_MB_CMD_CQC_BT, 0, 0, 1);
+ if (ret)
+ return ret;
+
+ *addr = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) |
+ ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) <<
+ QM_XQC_ADDR_OFFSET);
+
+ return 0;
+}
+
+static int qm_rw_regs_read(struct hisi_qm *qm, struct acc_vf_data *vf_data)
+{
+ struct device *dev = &qm->pdev->dev;
+ int ret;
+
+ ret = qm_read_reg(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1);
+ if (ret) {
+ dev_err(dev, "failed to read QM_VF_AEQ_INT_MASK\n");
+ return ret;
+ }
+
+ ret = qm_read_reg(qm, QM_VF_EQ_INT_MASK, &vf_data->eq_int_mask, 1);
+ if (ret) {
+ dev_err(dev, "failed to read QM_VF_EQ_INT_MASK\n");
+ return ret;
+ }
+
+ ret = qm_read_reg(qm, QM_IFC_INT_SOURCE_V,
+ &vf_data->ifc_int_source, 1);
+ if (ret) {
+ dev_err(dev, "failed to read QM_IFC_INT_SOURCE_V\n");
+ return ret;
+ }
+
+ ret = qm_read_reg(qm, QM_IFC_INT_MASK, &vf_data->ifc_int_mask, 1);
+ if (ret) {
+ dev_err(dev, "failed to read QM_IFC_INT_MASK\n");
+ return ret;
+ }
+
+ ret = qm_read_reg(qm, QM_IFC_INT_SET_V, &vf_data->ifc_int_set, 1);
+ if (ret) {
+ dev_err(dev, "failed to read QM_IFC_INT_SET_V\n");
+ return ret;
+ }
+
+ ret = qm_read_reg(qm, QM_PAGE_SIZE, &vf_data->page_size, 1);
+ if (ret) {
+ dev_err(dev, "failed to read QM_PAGE_SIZE\n");
+ return ret;
+ }
+
+ /* QM_EQC_DW has 7 regs */
+ ret = qm_read_reg(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7);
+ if (ret) {
+ dev_err(dev, "failed to read QM_EQC_DW\n");
+ return ret;
+ }
+
+ /* QM_AEQC_DW has 7 regs */
+ ret = qm_read_reg(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7);
+ if (ret) {
+ dev_err(dev, "failed to read QM_AEQC_DW\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int qm_rw_regs_write(struct hisi_qm *qm, struct acc_vf_data *vf_data)
+{
+ struct device *dev = &qm->pdev->dev;
+ int ret;
+
+ /* check VF state */
+ if (unlikely(qm_wait_mb_ready(qm))) {
+ dev_err(&qm->pdev->dev, "QM device is not ready to write\n");
+ return -EBUSY;
+ }
+
+ ret = qm_write_reg(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1);
+ if (ret) {
+ dev_err(dev, "failed to write QM_VF_AEQ_INT_MASK\n");
+ return ret;
+ }
+
+ ret = qm_write_reg(qm, QM_VF_EQ_INT_MASK, &vf_data->eq_int_mask, 1);
+ if (ret) {
+ dev_err(dev, "failed to write QM_VF_EQ_INT_MASK\n");
+ return ret;
+ }
+
+ ret = qm_write_reg(qm, QM_IFC_INT_SOURCE_V,
+ &vf_data->ifc_int_source, 1);
+ if (ret) {
+ dev_err(dev, "failed to write QM_IFC_INT_SOURCE_V\n");
+ return ret;
+ }
+
+ ret = qm_write_reg(qm, QM_IFC_INT_MASK, &vf_data->ifc_int_mask, 1);
+ if (ret) {
+ dev_err(dev, "failed to write QM_IFC_INT_MASK\n");
+ return ret;
+ }
+
+ ret = qm_write_reg(qm, QM_IFC_INT_SET_V, &vf_data->ifc_int_set, 1);
+ if (ret) {
+ dev_err(dev, "failed to write QM_IFC_INT_SET_V\n");
+ return ret;
+ }
+
+ ret = qm_write_reg(qm, QM_QUE_ISO_CFG_V, &vf_data->que_iso_cfg, 1);
+ if (ret) {
+ dev_err(dev, "failed to write QM_QUE_ISO_CFG_V\n");
+ return ret;
+ }
+
+ ret = qm_write_reg(qm, QM_PAGE_SIZE, &vf_data->page_size, 1);
+ if (ret) {
+ dev_err(dev, "failed to write QM_PAGE_SIZE\n");
+ return ret;
+ }
+
+ /* QM_EQC_DW has 7 regs */
+ ret = qm_write_reg(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7);
+ if (ret) {
+ dev_err(dev, "failed to write QM_EQC_DW\n");
+ return ret;
+ }
+
+ /* QM_AEQC_DW has 7 regs */
+ ret = qm_write_reg(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7);
+ if (ret) {
+ dev_err(dev, "failed to write QM_AEQC_DW\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static void qm_db(struct hisi_qm *qm, u16 qn, u8 cmd,
+ u16 index, u8 priority)
+{
+ u64 doorbell;
+ u64 dbase;
+ u16 randata = 0;
+
+ if (cmd == QM_DOORBELL_CMD_SQ || cmd == QM_DOORBELL_CMD_CQ)
+ dbase = QM_DOORBELL_SQ_CQ_BASE_V2;
+ else
+ dbase = QM_DOORBELL_EQ_AEQ_BASE_V2;
+
+ doorbell = qn | ((u64)cmd << QM_DB_CMD_SHIFT_V2) |
+ ((u64)randata << QM_DB_RAND_SHIFT_V2) |
+ ((u64)index << QM_DB_INDEX_SHIFT_V2) |
+ ((u64)priority << QM_DB_PRIORITY_SHIFT_V2);
+
+ writeq(doorbell, qm->io_base + dbase);
+}
+
+static int pf_qm_get_qp_num(struct hisi_qm *qm, int vf_id, u32 *rbase)
+{
+ unsigned int val;
+ u64 sqc_vft;
+ u32 qp_num;
+ int ret;
+
+ ret = readl_relaxed_poll_timeout(qm->io_base + QM_VFT_CFG_RDY, val,
+ val & BIT(0), MB_POLL_PERIOD_US,
+ MB_POLL_TIMEOUT_US);
+ if (ret)
+ return ret;
+
+ writel(0x1, qm->io_base + QM_VFT_CFG_OP_WR);
+ /* 0 mean SQC VFT */
+ writel(0x0, qm->io_base + QM_VFT_CFG_TYPE);
+ writel(vf_id, qm->io_base + QM_VFT_CFG);
+
+ writel(0x0, qm->io_base + QM_VFT_CFG_RDY);
+ writel(0x1, qm->io_base + QM_VFT_CFG_OP_ENABLE);
+
+ ret = readl_relaxed_poll_timeout(qm->io_base + QM_VFT_CFG_RDY, val,
+ val & BIT(0), MB_POLL_PERIOD_US,
+ MB_POLL_TIMEOUT_US);
+ if (ret)
+ return ret;
+
+ sqc_vft = readl(qm->io_base + QM_VFT_CFG_DATA_L) |
+ ((u64)readl(qm->io_base + QM_VFT_CFG_DATA_H) <<
+ QM_XQC_ADDR_OFFSET);
+ *rbase = QM_SQC_VFT_BASE_MASK_V2 &
+ (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2);
+ qp_num = (QM_SQC_VFT_NUM_MASK_V2 &
+ (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1;
+
+ return qp_num;
+}
+
+static void qm_dev_cmd_init(struct hisi_qm *qm)
+{
+ /* Clear VF communication status registers. */
+ writel(0x1, qm->io_base + QM_IFC_INT_SOURCE_V);
+
+ /* Enable pf and vf communication. */
+ writel(0x0, qm->io_base + QM_IFC_INT_MASK);
+}
+
+static int vf_qm_cache_wb(struct hisi_qm *qm)
+{
+ unsigned int val;
+
+ writel(0x1, qm->io_base + QM_CACHE_WB_START);
+ if (readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE,
+ val, val & BIT(0), MB_POLL_PERIOD_US,
+ MB_POLL_TIMEOUT_US)) {
+ dev_err(&qm->pdev->dev, "vf QM writeback sqc cache fail\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void vf_qm_fun_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_qm *qm)
+{
+ int i;
+
+ for (i = 0; i < qm->qp_num; i++)
+ qm_db(qm, i, QM_DOORBELL_CMD_SQ, 0, 1);
+}
+
+static int vf_qm_func_stop(struct hisi_qm *qm)
+{
+ return qm_mb(qm, QM_MB_CMD_PAUSE_QM, 0, 0, 0);
+}
+
+static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_acc_vf_migration_file *migf)
+{
+ struct acc_vf_data *vf_data = &migf->vf_data;
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm;
+ struct device *dev = &vf_qm->pdev->dev;
+ u32 que_iso_state;
+ int ret;
+
+ if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done)
+ return 0;
+
+ /* vf acc dev type check */
+ if (vf_data->dev_id != hisi_acc_vdev->vf_dev->device) {
+ dev_err(dev, "failed to match VF devices\n");
+ return -EINVAL;
+ }
+
+ /* vf qp num check */
+ ret = qm_get_vft(vf_qm, &vf_qm->qp_base);
+ if (ret <= 0) {
+ dev_err(dev, "failed to get vft qp nums\n");
+ return -EINVAL;
+ }
+
+ if (ret != vf_data->qp_num) {
+ dev_err(dev, "failed to match VF qp num\n");
+ return -EINVAL;
+ }
+
+ vf_qm->qp_num = ret;
+
+ /* vf isolation state check */
+ ret = qm_read_reg(pf_qm, QM_QUE_ISO_CFG_V, &que_iso_state, 1);
+ if (ret) {
+ dev_err(dev, "failed to read QM_QUE_ISO_CFG_V\n");
+ return ret;
+ }
+
+ if (vf_data->que_iso_cfg != que_iso_state) {
+ dev_err(dev, "failed to match isolation state\n");
+ return ret;
+ }
+
+ hisi_acc_vdev->match_done = 1;
+ return 0;
+}
+
+static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_acc_vf_migration_file *migf)
+{
+ struct acc_vf_data *vf_data = &migf->vf_data;
+ struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm;
+ struct device *dev = &pf_qm->pdev->dev;
+ int vf_id = hisi_acc_vdev->vf_id;
+ int ret;
+
+ /* save device id */
+ vf_data->dev_id = hisi_acc_vdev->vf_dev->device;
+
+ /* vf qp num save from PF */
+ ret = pf_qm_get_qp_num(pf_qm, vf_id, &vf_data->qp_base);
+ if (ret <= 0) {
+ dev_err(dev, "failed to get vft qp nums!\n");
+ return -EINVAL;
+ }
+
+ vf_data->qp_num = ret;
+
+ /* VF isolation state save from PF */
+ ret = qm_read_reg(pf_qm, QM_QUE_ISO_CFG_V, &vf_data->que_iso_cfg, 1);
+ if (ret) {
+ dev_err(dev, "failed to read QM_QUE_ISO_CFG_V!\n");
+ return ret;
+ }
+
+ migf->total_length = QM_MATCH_SIZE;
+
+ return 0;
+}
+
+static int hisi_acc_vf_load_state(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_acc_vf_migration_file *migf)
+{
+ struct hisi_qm *qm = &hisi_acc_vdev->vf_qm;
+ struct device *dev = &qm->pdev->dev;
+ struct acc_vf_data *vf_data = &migf->vf_data;
+ int ret;
+
+ /* Return if only match data was transferred */
+ if (migf->total_length == QM_MATCH_SIZE) {
+ hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
+ return 0;
+ }
+
+ if (migf->total_length < sizeof(struct acc_vf_data))
+ return -EINVAL;
+
+ qm->eqe_dma = vf_data->eqe_dma;
+ qm->aeqe_dma = vf_data->aeqe_dma;
+ qm->sqc_dma = vf_data->sqc_dma;
+ qm->cqc_dma = vf_data->cqc_dma;
+
+ qm->qp_base = vf_data->qp_base;
+ qm->qp_num = vf_data->qp_num;
+
+ ret = qm_rw_regs_write(qm, vf_data);
+ if (ret) {
+ dev_err(dev, "Set VF regs failed\n");
+ return ret;
+ }
+
+ ret = qm_mb(qm, QM_MB_CMD_SQC_BT, qm->sqc_dma, 0, 0);
+ if (ret) {
+ dev_err(dev, "Set sqc failed\n");
+ return ret;
+ }
+
+ ret = qm_mb(qm, QM_MB_CMD_CQC_BT, qm->cqc_dma, 0, 0);
+ if (ret) {
+ dev_err(dev, "Set cqc failed\n");
+ return ret;
+ }
+
+ qm_dev_cmd_init(qm);
+ return 0;
+}
+
+static void hisi_acc_vf_disable_fd(struct hisi_acc_vf_migration_file *migf)
+{
+ mutex_lock(&migf->lock);
+ migf->disabled = true;
+ migf->total_length = 0;
+ migf->filp->f_pos = 0;
+ mutex_unlock(&migf->lock);
+}
+
+static int hisi_acc_vf_release_file(struct inode *inode, struct file *filp)
+{
+ struct hisi_acc_vf_migration_file *migf = filp->private_data;
+
+ hisi_acc_vf_disable_fd(migf);
+ mutex_destroy(&migf->lock);
+ kfree(migf);
+ return 0;
+}
+
+static ssize_t hisi_acc_vf_resume_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct hisi_acc_vf_migration_file *migf = filp->private_data;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(migf,
+ struct hisi_acc_vf_core_device, resuming_migf);
+ loff_t requested_length;
+ u8 *data = (u8 *)&migf->vf_data;
+ ssize_t done = 0;
+ int ret;
+
+ if (pos)
+ return -ESPIPE;
+ pos = &filp->f_pos;
+
+ if (*pos < 0 ||
+ check_add_overflow((loff_t)len, *pos, &requested_length))
+ return -EINVAL;
+
+ if (requested_length > sizeof(struct acc_vf_data))
+ return -ENOMEM;
+
+ mutex_lock(&migf->lock);
+ if (migf->disabled) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ ret = copy_from_user(data + *pos, buf, len);
+ if (ret) {
+ done = -EFAULT;
+ goto out_unlock;
+ }
+ *pos += len;
+ done = len;
+ migf->total_length += len;
+
+ ret = vf_qm_check_match(hisi_acc_vdev, migf);
+ if (ret)
+ done = -EFAULT;
+
+out_unlock:
+ mutex_unlock(&migf->lock);
+ return done;
+}
+
+static const struct file_operations hisi_acc_vf_resume_fops = {
+ .owner = THIS_MODULE,
+ .write = hisi_acc_vf_resume_write,
+ .release = hisi_acc_vf_release_file,
+ .llseek = no_llseek,
+};
+
+static int hisi_acc_vf_pci_resume(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_acc_vf_migration_file *migf)
+{
+ migf->filp = anon_inode_getfile("hisi_acc_vf_mig", &hisi_acc_vf_resume_fops, migf,
+ O_WRONLY);
+ if (IS_ERR(migf->filp)) {
+ int err = PTR_ERR(migf->filp);
+
+ return err;
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ return 0;
+}
+
+static int hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_acc_vf_migration_file *migf)
+{
+ struct acc_vf_data *vf_data = &migf->vf_data;
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ struct device *dev = &vf_qm->pdev->dev;
+ int ret;
+
+ if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
+ /* Update state and return. */
+ hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
+ return 0;
+ }
+
+ ret = vf_qm_cache_wb(vf_qm);
+ if (ret) {
+ dev_err(dev, "failed to writeback QM Cache!\n");
+ return ret;
+ }
+
+ ret = qm_rw_regs_read(vf_qm, vf_data);
+ if (ret)
+ return -EINVAL;
+
+ /* Every reg is 32 bit, the dma address is 64 bit. */
+ vf_data->eqe_dma = vf_data->qm_eqc_dw[2];
+ vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET;
+ vf_data->eqe_dma |= vf_data->qm_eqc_dw[1];
+ vf_data->aeqe_dma = vf_data->qm_aeqc_dw[2];
+ vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET;
+ vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[1];
+
+ /* Through SQC_BT/CQC_BT to get sqc and cqc address */
+ ret = qm_get_sqc(vf_qm, &vf_data->sqc_dma);
+ if (ret) {
+ dev_err(dev, "failed to read SQC addr!\n");
+ return -EINVAL;
+ }
+
+ ret = qm_get_cqc(vf_qm, &vf_data->cqc_dma);
+ if (ret) {
+ dev_err(dev, "failed to read CQC addr!\n");
+ return -EINVAL;
+ }
+
+ migf->total_length = sizeof(struct acc_vf_data);
+ return 0;
+}
+
+static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+ struct device *dev = &hisi_acc_vdev->vf_dev->dev;
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ int ret;
+
+ ret = vf_qm_func_stop(vf_qm);
+ if (ret) {
+ dev_err(dev, "failed to stop QM VF function!\n");
+ return ret;
+ }
+
+ ret = qm_check_int_state(hisi_acc_vdev);
+ if (ret) {
+ dev_err(dev, "failed to check QM INT state!\n");
+ return ret;
+ }
+ return 0;
+}
+
+static void hisi_acc_vf_start_device(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+
+ if (hisi_acc_vdev->vf_qm_state != QM_READY)
+ return;
+
+ vf_qm_fun_reset(hisi_acc_vdev, vf_qm);
+}
+
+static ssize_t hisi_acc_vf_save_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *pos)
+{
+ struct hisi_acc_vf_migration_file *migf = filp->private_data;
+ ssize_t done = 0;
+ int ret;
+
+ if (pos)
+ return -ESPIPE;
+ pos = &filp->f_pos;
+
+ mutex_lock(&migf->lock);
+ if (*pos > migf->total_length) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (migf->disabled) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ len = min_t(size_t, migf->total_length - *pos, len);
+ if (len) {
+ u8 *data = (u8 *)&migf->vf_data;
+
+ ret = copy_to_user(buf, data + *pos, len);
+ if (ret) {
+ done = -EFAULT;
+ goto out_unlock;
+ }
+ *pos += len;
+ done = len;
+ }
+out_unlock:
+ mutex_unlock(&migf->lock);
+ return done;
+}
+
+static const struct file_operations hisi_acc_vf_save_fops = {
+ .owner = THIS_MODULE,
+ .read = hisi_acc_vf_save_read,
+ .release = hisi_acc_vf_release_file,
+ .llseek = no_llseek,
+};
+
+static int hisi_acc_vf_pre_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_acc_vf_migration_file *migf)
+{
+ int ret;
+
+ migf->filp = anon_inode_getfile("hisi_acc_vf_mig", &hisi_acc_vf_save_fops, migf,
+ O_RDONLY);
+ if (IS_ERR(migf->filp)) {
+ int err = PTR_ERR(migf->filp);
+
+ return err;
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+
+ ret = vf_qm_get_match_data(hisi_acc_vdev, migf);
+ if (ret) {
+ fput(migf->filp);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void hisi_acc_vf_disable_fds(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+ struct hisi_acc_vf_migration_file *resuming_migf = &hisi_acc_vdev->resuming_migf;
+ struct hisi_acc_vf_migration_file *saving_migf = &hisi_acc_vdev->saving_migf;
+
+ if (resuming_migf->filp) {
+ hisi_acc_vf_disable_fd(resuming_migf);
+ fput(resuming_migf->filp);
+ }
+
+ if (saving_migf->filp) {
+ hisi_acc_vf_disable_fd(saving_migf);
+ fput(saving_migf->filp);
+ }
+}
+
+static struct file *
+hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ u32 new)
+{
+ u32 cur = hisi_acc_vdev->mig_state;
+ int ret;
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) {
+ struct hisi_acc_vf_migration_file *migf = &hisi_acc_vdev->saving_migf;
+
+ ret = hisi_acc_vf_pre_copy(hisi_acc_vdev, migf);
+ if (ret)
+ return ERR_PTR(ret);
+ get_file(migf->filp);
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct hisi_acc_vf_migration_file *migf = &hisi_acc_vdev->saving_migf;
+
+ ret = hisi_acc_vf_stop_device(hisi_acc_vdev);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = hisi_acc_vf_stop_copy(hisi_acc_vdev, migf);
+ if (ret)
+ return ERR_PTR(ret);
+ get_file(migf->filp);
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) {
+ ret = hisi_acc_vf_stop_device(hisi_acc_vdev);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) {
+ hisi_acc_vf_disable_fds(hisi_acc_vdev);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+ struct hisi_acc_vf_migration_file *migf = &hisi_acc_vdev->resuming_migf;
+
+ ret = hisi_acc_vf_pci_resume(hisi_acc_vdev, migf);
+ if (ret)
+ return ERR_PTR(ret);
+ get_file(migf->filp);
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+ struct hisi_acc_vf_migration_file *migf = &hisi_acc_vdev->resuming_migf;
+
+ ret = hisi_acc_vf_load_state(hisi_acc_vdev, migf);
+ if (ret)
+ return ERR_PTR(ret);
+ hisi_acc_vf_disable_fds(hisi_acc_vdev);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_STOP || cur == VFIO_DEVICE_STATE_PRE_COPY) &&
+ new == VFIO_DEVICE_STATE_RUNNING) {
+ hisi_acc_vf_start_device(hisi_acc_vdev);
+ return NULL;
+ }
+
+ /*
+ * vfio_mig_get_next_state() does not use arcs other than the above
+ */
+
+ WARN_ON(true);
+ return ERR_PTR(-EINVAL);
+}
+
+static struct file *
+hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev,
+ struct hisi_acc_vf_core_device, core_device.vdev);
+ enum vfio_device_mig_state next_state;
+ struct file *res = NULL;
+ int ret;
+
+ mutex_lock(&hisi_acc_vdev->state_mutex);
+ while (new_state != hisi_acc_vdev->mig_state) {
+ ret = vfio_mig_get_next_state(vdev,
+ hisi_acc_vdev->mig_state,
+ new_state, &next_state);
+ if (ret) {
+ res = ERR_PTR(-EINVAL);
+ break;
+ }
+
+ res = hisi_acc_vf_set_device_state(hisi_acc_vdev, next_state);
+ if (IS_ERR(res))
+ break;
+ hisi_acc_vdev->mig_state = next_state;
+ if (WARN_ON(res && new_state != hisi_acc_vdev->mig_state)) {
+ fput(res);
+ res = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
+ return res;
+}
+
+static int
+hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state *curr_state)
+{
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev,
+ struct hisi_acc_vf_core_device, core_device.vdev);
+
+ mutex_lock(&hisi_acc_vdev->state_mutex);
+ *curr_state = hisi_acc_vdev->mig_state;
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
+ return 0;
+}
+
+static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+ struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device;
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ struct pci_dev *vf_dev = vdev->pdev;
+
+ /*
+ * ACC VF dev BAR2 region consists of both functional register space
+ * and migration control register space. For migration to work, we
+ * need access to both. Hence, we map the entire BAR2 region here.
+ * But from a security point of view, we restrict access to the
+ * migration control space from Guest(Please see mmap/ioctl/read/write
+ * override functions).
+ *
+ * Also the HiSilicon ACC VF devices supported by this driver on
+ * HiSilicon hardware platforms are integrated end point devices
+ * and has no capability to perform PCIe P2P.
+ */
+
+ vf_qm->io_base =
+ ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX),
+ pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX));
+ if (!vf_qm->io_base)
+ return -EIO;
+
+ vf_qm->fun_type = QM_HW_VF;
+ vf_qm->pdev = vf_dev;
+ mutex_init(&vf_qm->mailbox_lock);
+
+ return 0;
+}
+
+static struct hisi_qm *hisi_acc_get_pf_qm(struct pci_dev *pdev)
+{
+ struct hisi_qm *pf_qm;
+ struct pci_driver *pf_driver;
+
+ if (!pdev->is_virtfn)
+ return NULL;
+
+ switch (pdev->device) {
+ case PCI_DEVICE_ID_HUAWEI_SEC_VF:
+ pf_driver = hisi_sec_get_pf_driver();
+ break;
+ case PCI_DEVICE_ID_HUAWEI_HPRE_VF:
+ pf_driver = hisi_hpre_get_pf_driver();
+ break;
+ case PCI_DEVICE_ID_HUAWEI_ZIP_VF:
+ pf_driver = hisi_zip_get_pf_driver();
+ break;
+ default:
+ return NULL;
+ }
+
+ if (!pf_driver)
+ return NULL;
+
+ pf_qm = pci_iov_get_pf_drvdata(pdev, pf_driver);
+
+ return !IS_ERR(pf_qm) ? pf_qm : NULL;
+}
static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev,
size_t count, loff_t *ppos,
@@ -122,29 +1086,72 @@ static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;
}
+ } else if (cmd == VFIO_DEVICE_MIG_PRECOPY) {
+ struct vfio_device_mig_precopy precopy;
+ enum vfio_device_mig_state curr_state;
+ unsigned long minsz;
+ int ret;
+
+ minsz = offsetofend(struct vfio_device_mig_precopy, dirty_bytes);
+
+ if (copy_from_user(&precopy, (void __user *)arg, minsz))
+ return -EFAULT;
+ if (precopy.argsz < minsz)
+ return -EINVAL;
+
+ ret = hisi_acc_vfio_pci_get_device_state(core_vdev, &curr_state);
+ if (!ret && curr_state == VFIO_DEVICE_STATE_PRE_COPY) {
+ precopy.initial_bytes = QM_MATCH_SIZE;
+ precopy.dirty_bytes = QM_MATCH_SIZE;
+ } else {
+ precopy.initial_bytes = 0;
+ precopy.dirty_bytes = 0;
+ }
+
+ return copy_to_user((void __user *)arg, &precopy, minsz) ?
+ -EFAULT : 0;
}
return vfio_pci_core_ioctl(core_vdev, cmd, arg);
}
static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
{
- struct vfio_pci_core_device *vdev =
- container_of(core_vdev, struct vfio_pci_core_device, vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
+ struct hisi_acc_vf_core_device, core_device.vdev);
+ struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device;
int ret;
ret = vfio_pci_core_enable(vdev);
if (ret)
return ret;
- vfio_pci_core_finish_enable(vdev);
+ if (core_vdev->ops->migration_set_state) {
+ ret = hisi_acc_vf_qm_init(hisi_acc_vdev);
+ if (ret) {
+ vfio_pci_core_disable(vdev);
+ return ret;
+ }
+ hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ }
+ vfio_pci_core_finish_enable(vdev);
return 0;
}
+static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
+{
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
+ struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+
+ iounmap(vf_qm->io_base);
+ vfio_pci_core_close_device(core_vdev);
+}
+
static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
.name = "hisi-acc-vfio-pci-migration",
.open_device = hisi_acc_vfio_pci_open_device,
- .close_device = vfio_pci_core_close_device,
+ .close_device = hisi_acc_vfio_pci_close_device,
.ioctl = hisi_acc_vfio_pci_ioctl,
.device_feature = vfio_pci_core_ioctl_feature,
.read = hisi_acc_vfio_pci_read,
@@ -152,6 +1159,8 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
.mmap = hisi_acc_vfio_pci_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .migration_set_state = hisi_acc_vfio_pci_set_device_state,
+ .migration_get_state = hisi_acc_vfio_pci_get_device_state,
};
static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
@@ -167,38 +1176,76 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
.match = vfio_pci_core_match,
};
+static int
+hisi_acc_vfio_pci_migrn_init(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct pci_dev *pdev, struct hisi_qm *pf_qm)
+{
+ int vf_id;
+
+ vf_id = pci_iov_vf_id(pdev);
+ if (vf_id < 0)
+ return vf_id;
+
+ hisi_acc_vdev->vf_id = vf_id + 1;
+ /*
+ * We set _PRE_COPY here for an early check on compatibility between
+ * src and dst devices.
+ */
+ hisi_acc_vdev->core_device.vdev.migration_flags =
+ (VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY);
+ hisi_acc_vdev->pf_qm = pf_qm;
+ hisi_acc_vdev->vf_dev = pdev;
+ mutex_init(&hisi_acc_vdev->state_mutex);
+
+ return 0;
+}
+
static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
- struct vfio_pci_core_device *vdev;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev;
+ struct hisi_qm *pf_qm;
int ret;
- vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
- if (!vdev)
+ hisi_acc_vdev = kzalloc(sizeof(*hisi_acc_vdev), GFP_KERNEL);
+ if (!hisi_acc_vdev)
return -ENOMEM;
- vfio_pci_core_init_device(vdev, pdev, &hisi_acc_vfio_pci_ops);
+ pf_qm = hisi_acc_get_pf_qm(pdev);
+ if (pf_qm && pf_qm->ver >= QM_HW_V3) {
+ ret = hisi_acc_vfio_pci_migrn_init(hisi_acc_vdev, pdev, pf_qm);
+ if (!ret) {
+ vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev,
+ &hisi_acc_vfio_pci_migrn_ops);
+ } else {
+ pci_warn(pdev, "migration support failed, continue with generic interface\n");
+ vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev,
+ &hisi_acc_vfio_pci_ops);
+ }
+ } else {
+ vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev,
+ &hisi_acc_vfio_pci_ops);
+ }
- ret = vfio_pci_core_register_device(vdev);
+ ret = vfio_pci_core_register_device(&hisi_acc_vdev->core_device);
if (ret)
goto out_free;
- dev_set_drvdata(&pdev->dev, vdev);
-
+ dev_set_drvdata(&pdev->dev, hisi_acc_vdev);
return 0;
out_free:
- vfio_pci_core_uninit_device(vdev);
- kfree(vdev);
+ vfio_pci_core_uninit_device(&hisi_acc_vdev->core_device);
+ kfree(hisi_acc_vdev);
return ret;
}
static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev)
{
- struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = dev_get_drvdata(&pdev->dev);
- vfio_pci_core_unregister_device(vdev);
- vfio_pci_core_uninit_device(vdev);
- kfree(vdev);
+ vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device);
+ vfio_pci_core_uninit_device(&hisi_acc_vdev->core_device);
+ kfree(hisi_acc_vdev);
}
static const struct pci_device_id hisi_acc_vfio_pci_table[] = {
@@ -223,4 +1270,4 @@ module_pci_driver(hisi_acc_vfio_pci_driver);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Liu Longfang <[email protected]>");
MODULE_AUTHOR("Shameer Kolothum <[email protected]>");
-MODULE_DESCRIPTION("HiSilicon VFIO PCI - Generic VFIO PCI driver for HiSilicon ACC device family");
+MODULE_DESCRIPTION("HiSilicon VFIO PCI - VFIO PCI driver with live migration support for HiSilicon ACC device family");
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
new file mode 100644
index 000000000000..51bc7e92a776
--- /dev/null
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2021 HiSilicon Ltd. */
+
+#ifndef HISI_ACC_VFIO_PCI_H
+#define HISI_ACC_VFIO_PCI_H
+
+#include <linux/hisi_acc_qm.h>
+
+#define MB_POLL_PERIOD_US 10
+#define MB_POLL_TIMEOUT_US 1000
+#define QM_CACHE_WB_START 0x204
+#define QM_CACHE_WB_DONE 0x208
+#define QM_MB_CMD_PAUSE_QM 0xe
+#define QM_ABNORMAL_INT_STATUS 0x100008
+#define QM_IFC_INT_STATUS 0x0028
+#define SEC_CORE_INT_STATUS 0x301008
+#define HPRE_HAC_INT_STATUS 0x301800
+#define HZIP_CORE_INT_STATUS 0x3010AC
+#define QM_QUE_ISO_CFG 0x301154
+
+#define QM_VFT_CFG_RDY 0x10006c
+#define QM_VFT_CFG_OP_WR 0x100058
+#define QM_VFT_CFG_TYPE 0x10005c
+#define QM_VFT_CFG 0x100060
+#define QM_VFT_CFG_OP_ENABLE 0x100054
+#define QM_VFT_CFG_DATA_L 0x100064
+#define QM_VFT_CFG_DATA_H 0x100068
+
+#define ERROR_CHECK_TIMEOUT 100
+#define CHECK_DELAY_TIME 100
+
+#define QM_SQC_VFT_BASE_SHIFT_V2 28
+#define QM_SQC_VFT_BASE_MASK_V2 GENMASK(15, 0)
+#define QM_SQC_VFT_NUM_SHIFT_V2 45
+#define QM_SQC_VFT_NUM_MASK_V2 GENMASK(9, 0)
+
+/* RW regs */
+#define QM_REGS_MAX_LEN 7
+#define QM_REG_ADDR_OFFSET 0x0004
+
+#define QM_XQC_ADDR_OFFSET 32U
+#define QM_VF_AEQ_INT_MASK 0x0004
+#define QM_VF_EQ_INT_MASK 0x000c
+#define QM_IFC_INT_SOURCE_V 0x0020
+#define QM_IFC_INT_MASK 0x0024
+#define QM_IFC_INT_SET_V 0x002c
+#define QM_QUE_ISO_CFG_V 0x0030
+#define QM_PAGE_SIZE 0x0034
+
+#define QM_EQC_DW0 0X8000
+#define QM_AEQC_DW0 0X8020
+
+struct acc_vf_data {
+#define QM_MATCH_SIZE 32L
+ /* QM match information */
+ u32 qp_num;
+ u32 dev_id;
+ u32 que_iso_cfg;
+ u32 qp_base;
+ /* QM reserved match information */
+ u32 qm_rsv_state[4];
+
+ /* QM RW regs */
+ u32 aeq_int_mask;
+ u32 eq_int_mask;
+ u32 ifc_int_source;
+ u32 ifc_int_mask;
+ u32 ifc_int_set;
+ u32 page_size;
+
+ /* QM_EQC_DW has 7 regs */
+ u32 qm_eqc_dw[7];
+
+ /* QM_AEQC_DW has 7 regs */
+ u32 qm_aeqc_dw[7];
+
+ /* QM reserved 5 regs */
+ u32 qm_rsv_regs[5];
+
+ /* qm memory init information */
+ u64 eqe_dma;
+ u64 aeqe_dma;
+ u64 sqc_dma;
+ u64 cqc_dma;
+};
+
+struct hisi_acc_vf_migration_file {
+ struct file *filp;
+ struct mutex lock;
+ bool disabled;
+
+ struct acc_vf_data vf_data;
+ size_t total_length;
+};
+
+struct hisi_acc_vf_core_device {
+ struct vfio_pci_core_device core_device;
+ u8 match_done:1;
+ /* for migration state */
+ struct mutex state_mutex;
+ enum vfio_device_mig_state mig_state;
+ struct pci_dev *pf_dev;
+ struct pci_dev *vf_dev;
+ struct hisi_qm *pf_qm;
+ struct hisi_qm vf_qm;
+ u32 vf_qm_state;
+ int vf_id;
+
+ struct hisi_acc_vf_migration_file resuming_migf;
+ struct hisi_acc_vf_migration_file saving_migf;
+};
+#endif /* HISI_ACC_VFIO_PCI_H */
--
2.25.1
On Mon, 28 Feb 2022 14:05:20 -0400
Jason Gunthorpe <[email protected]> wrote:
> On Mon, Feb 28, 2022 at 06:01:44PM +0000, Shameerali Kolothum Thodi wrote:
>
> > +static long hisi_acc_vf_save_unl_ioctl(struct file *filp,
> > + unsigned int cmd, unsigned long arg)
> > +{
> > + struct hisi_acc_vf_migration_file *migf = filp->private_data;
> > + loff_t *pos = &filp->f_pos;
> > + struct vfio_device_mig_precopy precopy;
> > + unsigned long minsz;
> > +
> > + if (cmd != VFIO_DEVICE_MIG_PRECOPY)
> > + return -EINVAL;
>
> ENOTTY
>
> > +
> > + minsz = offsetofend(struct vfio_device_mig_precopy, dirty_bytes);
> > +
> > + if (copy_from_user(&precopy, (void __user *)arg, minsz))
> > + return -EFAULT;
> > + if (precopy.argsz < minsz)
> > + return -EINVAL;
> > +
> > + mutex_lock(&migf->lock);
> > + if (*pos > migf->total_length) {
> > + mutex_unlock(&migf->lock);
> > + return -EINVAL;
> > + }
> > +
> > + precopy.dirty_bytes = 0;
> > + precopy.initial_bytes = migf->total_length - *pos;
> > + mutex_unlock(&migf->lock);
> > + return copy_to_user((void __user *)arg, &precopy, minsz) ? -EFAULT : 0;
> > +}
>
> Yes
>
> And I noticed this didn't include the ENOMSG handling, read() should
> return ENOMSG when it reaches EOS for the pre-copy:
>
> + * During pre-copy the migration data FD has a temporary "end of stream" that is
> + * reached when both initial_bytes and dirty_byte are zero. For instance, this
> + * may indicate that the device is idle and not currently dirtying any internal
> + * state. When read() is done on this temporary end of stream the kernel driver
> + * should return ENOMSG from read(). Userspace can wait for more data (which may
> + * never come) by using poll.
I'm confused by your previous reply that the use of curr_state should
be eliminated, isn't this ioctl only valid while the device is in the
PRE_COPY or PRE_COPY_P2P states? Otherwise the STOP_COPY state would
have some expectation to be able to use this ioctl for devices
supporting PRE_COPY. I'd like to see the uapi clarify exactly what
states allow this ioctl and define the behavior of the ioctl when
transitioning out of those states with an open data_fd, ie. is it
defined to return an -errno once in STOP_COPY? Thanks,
Alex
On Mon, Feb 28, 2022 at 01:16:14PM -0700, Alex Williamson wrote:
> On Mon, 28 Feb 2022 14:05:20 -0400
> Jason Gunthorpe <[email protected]> wrote:
>
> > On Mon, Feb 28, 2022 at 06:01:44PM +0000, Shameerali Kolothum Thodi wrote:
> >
> > > +static long hisi_acc_vf_save_unl_ioctl(struct file *filp,
> > > + unsigned int cmd, unsigned long arg)
> > > +{
> > > + struct hisi_acc_vf_migration_file *migf = filp->private_data;
> > > + loff_t *pos = &filp->f_pos;
> > > + struct vfio_device_mig_precopy precopy;
> > > + unsigned long minsz;
> > > +
> > > + if (cmd != VFIO_DEVICE_MIG_PRECOPY)
> > > + return -EINVAL;
> >
> > ENOTTY
> >
> > > +
> > > + minsz = offsetofend(struct vfio_device_mig_precopy, dirty_bytes);
> > > +
> > > + if (copy_from_user(&precopy, (void __user *)arg, minsz))
> > > + return -EFAULT;
> > > + if (precopy.argsz < minsz)
> > > + return -EINVAL;
> > > +
> > > + mutex_lock(&migf->lock);
> > > + if (*pos > migf->total_length) {
> > > + mutex_unlock(&migf->lock);
> > > + return -EINVAL;
> > > + }
> > > +
> > > + precopy.dirty_bytes = 0;
> > > + precopy.initial_bytes = migf->total_length - *pos;
> > > + mutex_unlock(&migf->lock);
> > > + return copy_to_user((void __user *)arg, &precopy, minsz) ? -EFAULT : 0;
> > > +}
> >
> > Yes
> >
> > And I noticed this didn't include the ENOMSG handling, read() should
> > return ENOMSG when it reaches EOS for the pre-copy:
> >
> > + * During pre-copy the migration data FD has a temporary "end of stream" that is
> > + * reached when both initial_bytes and dirty_byte are zero. For instance, this
> > + * may indicate that the device is idle and not currently dirtying any internal
> > + * state. When read() is done on this temporary end of stream the kernel driver
> > + * should return ENOMSG from read(). Userspace can wait for more data (which may
> > + * never come) by using poll.
>
> I'm confused by your previous reply that the use of curr_state should
> be eliminated, isn't this ioctl only valid while the device is in the
> PRE_COPY or PRE_COPY_P2P states? Otherwise the STOP_COPY state would
> have some expectation to be able to use this ioctl for devices
> supporting PRE_COPY.
I think it is fine to keep working on stop copy, though the
implementation here isn't quite right for that..
if (migf->total_length > QM_MATCH_SIZE)
precopy.dirty_bytes = migf->total_length - QM_MATCH_SIZE - *pos;
else
precopy.dity_bytes = 0;
if (*pos < QM_MATCH_SIZE)
precopy.initial_bytes = QM_MATCH_SIZE - *pos;
else
precopy.initial_Bytes = 0;
Unless you think we should block it.
> I'd like to see the uapi clarify exactly what states allow this
> ioctl and define the behavior of the ioctl when transitioning out of
> those states with an open data_fd, ie. is it defined to return an
> -errno once in STOP_COPY? Thanks,
The ioctl is on the data_fd, so it should follow all the normal rules
of the data_fd just like read() - ie all ioctls/read/write fails when
teh state is moved outside one where the data_fd is valid.
That looks like another issue with the above, it doesn't chck
migf->disabled.
Should we add another sentence about this?
Jason
On Mon, 28 Feb 2022 16:29:19 -0400
Jason Gunthorpe <[email protected]> wrote:
> On Mon, Feb 28, 2022 at 01:16:14PM -0700, Alex Williamson wrote:
> > On Mon, 28 Feb 2022 14:05:20 -0400
> > Jason Gunthorpe <[email protected]> wrote:
> >
> > > On Mon, Feb 28, 2022 at 06:01:44PM +0000, Shameerali Kolothum Thodi wrote:
> > >
> > > > +static long hisi_acc_vf_save_unl_ioctl(struct file *filp,
> > > > + unsigned int cmd, unsigned long arg)
> > > > +{
> > > > + struct hisi_acc_vf_migration_file *migf = filp->private_data;
> > > > + loff_t *pos = &filp->f_pos;
> > > > + struct vfio_device_mig_precopy precopy;
> > > > + unsigned long minsz;
> > > > +
> > > > + if (cmd != VFIO_DEVICE_MIG_PRECOPY)
> > > > + return -EINVAL;
> > >
> > > ENOTTY
> > >
> > > > +
> > > > + minsz = offsetofend(struct vfio_device_mig_precopy, dirty_bytes);
> > > > +
> > > > + if (copy_from_user(&precopy, (void __user *)arg, minsz))
> > > > + return -EFAULT;
> > > > + if (precopy.argsz < minsz)
> > > > + return -EINVAL;
> > > > +
> > > > + mutex_lock(&migf->lock);
> > > > + if (*pos > migf->total_length) {
> > > > + mutex_unlock(&migf->lock);
> > > > + return -EINVAL;
> > > > + }
> > > > +
> > > > + precopy.dirty_bytes = 0;
> > > > + precopy.initial_bytes = migf->total_length - *pos;
> > > > + mutex_unlock(&migf->lock);
> > > > + return copy_to_user((void __user *)arg, &precopy, minsz) ? -EFAULT : 0;
> > > > +}
> > >
> > > Yes
> > >
> > > And I noticed this didn't include the ENOMSG handling, read() should
> > > return ENOMSG when it reaches EOS for the pre-copy:
> > >
> > > + * During pre-copy the migration data FD has a temporary "end of stream" that is
> > > + * reached when both initial_bytes and dirty_byte are zero. For instance, this
> > > + * may indicate that the device is idle and not currently dirtying any internal
> > > + * state. When read() is done on this temporary end of stream the kernel driver
> > > + * should return ENOMSG from read(). Userspace can wait for more data (which may
> > > + * never come) by using poll.
> >
> > I'm confused by your previous reply that the use of curr_state should
> > be eliminated, isn't this ioctl only valid while the device is in the
> > PRE_COPY or PRE_COPY_P2P states? Otherwise the STOP_COPY state would
> > have some expectation to be able to use this ioctl for devices
> > supporting PRE_COPY.
>
> I think it is fine to keep working on stop copy, though the
> implementation here isn't quite right for that..
>
> if (migf->total_length > QM_MATCH_SIZE)
> precopy.dirty_bytes = migf->total_length - QM_MATCH_SIZE - *pos;
> else
> precopy.dity_bytes = 0;
>
> if (*pos < QM_MATCH_SIZE)
> precopy.initial_bytes = QM_MATCH_SIZE - *pos;
> else
> precopy.initial_Bytes = 0;
>
> Unless you think we should block it.
What's the meaning of initial_bytes and dirty_bytes while in STOP_COPY?
It seems like these become meaningless and if so, why shouldn't the
ioctl simply return -EINVAL if the device state doesn't match the
window where it's useful?
> > I'd like to see the uapi clarify exactly what states allow this
> > ioctl and define the behavior of the ioctl when transitioning out of
> > those states with an open data_fd, ie. is it defined to return an
> > -errno once in STOP_COPY? Thanks,
>
> The ioctl is on the data_fd, so it should follow all the normal rules
> of the data_fd just like read() - ie all ioctls/read/write fails when
> teh state is moved outside one where the data_fd is valid.
>
> That looks like another issue with the above, it doesn't chck
> migf->disabled.
>
> Should we add another sentence about this?
Right, of course the ioctl goes away when the data_fd is invalid, the
question is more that we've created this PRE_COPY_* specific ioctl and
what does it mean to call it when not in a device state where the
data_fd is still valid but this ioctl is really not. We should
specify how the driver is intended to respond to this ioctl in
STOP_COPY. Thanks,
Alex
On Mon, 28 Feb 2022 19:47:09 -0400
Jason Gunthorpe <[email protected]> wrote:
> On Mon, Feb 28, 2022 at 02:20:34PM -0700, Alex Williamson wrote:
>
> > > Unless you think we should block it.
> >
> > What's the meaning of initial_bytes and dirty_bytes while in
> > STOP_COPY?
>
> Same as during pre-copy - both numbers are the bytes remaining to be
> read() from the FD in each bucket. They should continue to decline as
> read() progresses regardless of what state the data_fd is in.
>
> The only special thing about STOP_COPY is that dirty_bytes should not
> increase as the device should not be generating new dirty data.
>
> How about:
>
> * Drivers should attempt to return estimates so that initial_bytes +
> * dirty_bytes matches the amount of data an immediate transition to STOP_COPY
> * will require to be streamed. While in STOP_COPY the initial_bytes
> * and dirty_bytes should continue to be decrease as the data_fd
> * progresses streaming out the data.
>
> Remove the 'in the precopy phase' from the first sentance
>
> Adjust the last paragraph as:
>
> + * returning readable. ENOMSG may not be returned in STOP_COPY. Support
> + * for this ioctl is required when VFIO_MIGRATION_PRE_COPY is set.
This entire ioctl on the data_fd seems a bit strange given the previous
fuss about how difficult it is for a driver to estimate their migration
data size. Now drivers are forced to provide those estimates even if
they only intend to use PRE_COPY as an early compatibility test?
Obviously it's trivial for the acc driver that doesn't support dirty
tracking and only has a fixed size migration structure, but it seems to
contradict your earlier statements. For instance, can mlx5 implement
a PRE_COPY solely for compatibility testing or is it blocked by an
inability to provide data estimates for this ioctl?
Now if we propose that this ioctl is useful during the STOP_COPY phase,
how does a non-PRE_COPY driver opt-in to that beneficial use case? Do
we later add a different, optional ioctl for non-PRE_COPY and then
require userspace to support two different methods of getting remaining
data estimates for a device in STOP_COPY?
If our primary goal is to simplify the FSM, I'm actually a little
surprised we support the PRE_COPY* -> STOP_COPY transition directly
versus passing through STOP. It seems this exists due to our policy
that we can only generate one data_fd as a result of any sequence of
state transitions, but I think there might also be an option to achieve
similar if the PRE_COPY* states are skipped if they aren't the ultimate
end state of the arc. I'm sure that raises questions about how we
correlate a PRE_COPY* session to a STOP_COPY session though, but this
PRE_COPY* specific but ongoing usage in STOP_COPY ioctl seems ad-hoc.
Thanks,
Alex
On Tue, Mar 01, 2022 at 12:30:47PM -0700, Alex Williamson wrote:
> Wouldn't it make more sense if initial-bytes started at QM_MATCH_SIZE
> and dirty-bytes was always sizeof(vf_data) - QM_MATCH_SIZE? ie. QEMU
> would know that it has sizeof(vf_data) - QM_MATCH_SIZE remaining even
> while it's getting ENOMSG after reading QM_MATCH_SIZE bytes of data.
The purpose of this ioctl is to help userspace guess when moving on to
STOP_COPY is a good idea ie when the device has done almost all the
work it is going to be able to do in PRE_COPY. ENOMSG is a similar
indicator.
I expect all devices to have some additional STOP_COPY trailer_data in
addition to their PRE_COPY initial_data and dirty_data
There is a choice to make if we report the trailer_data during
PRE_COPY or not. As this is all estimates, it doesn't matter unless
the trailer_data is very big.
Having all devices trend toward a 0 dirty_bytes to say they are are
done all the pre-copy they can do makes sense from an API
perspective. If one device trends toward 10MB due to a big
trailer_data and one trends toward 0 bytes, how will qemu consistently
decide when best to trigger STOP_COPY? It makes the API less useful.
So, I would not include trailer_data in the dirty_bytes.
Estimating when to move on to STOP_COPY and trying to enforce a SLA on
STOP_COPY are different tasks and will probably end up with different
interfaces.
I still think the right way to approach the SLA is to inform the
driver what the permitted time and data size target is for STOP_COPY
and the driver can proceed or not based on its own internal
calculation.
> useful yet and you don't want to add dead kernel code, then let's
> define that this ioctl is only available in the PRE_COPY* states and
> returns -errno in the STOP_COPY state.
I'm OK with that, in acc it is done by checking migf->total_bytes >
QM_MATCH_SIZE during the read fop
> devices in STOP_COPY and let's also define if there's actually anything
> userspace can infer about remaining STOP_COPY data size while in
> PRE_COPY* via this ioctl. For example, is dirty-bytes zero or the
> remaining data structure size?
If we keep it then I would say it doesn't matter, userspace has to sum
the two values to get the total remaining length estimate, it is just
a bit quirky.
Jason
On Tue, 1 Mar 2022 16:39:38 -0400
Jason Gunthorpe <[email protected]> wrote:
> On Tue, Mar 01, 2022 at 12:30:47PM -0700, Alex Williamson wrote:
> > Wouldn't it make more sense if initial-bytes started at QM_MATCH_SIZE
> > and dirty-bytes was always sizeof(vf_data) - QM_MATCH_SIZE? ie. QEMU
> > would know that it has sizeof(vf_data) - QM_MATCH_SIZE remaining even
> > while it's getting ENOMSG after reading QM_MATCH_SIZE bytes of data.
>
> The purpose of this ioctl is to help userspace guess when moving on to
> STOP_COPY is a good idea ie when the device has done almost all the
> work it is going to be able to do in PRE_COPY. ENOMSG is a similar
> indicator.
>
> I expect all devices to have some additional STOP_COPY trailer_data in
> addition to their PRE_COPY initial_data and dirty_data
>
> There is a choice to make if we report the trailer_data during
> PRE_COPY or not. As this is all estimates, it doesn't matter unless
> the trailer_data is very big.
>
> Having all devices trend toward a 0 dirty_bytes to say they are are
> done all the pre-copy they can do makes sense from an API
> perspective. If one device trends toward 10MB due to a big
> trailer_data and one trends toward 0 bytes, how will qemu consistently
> decide when best to trigger STOP_COPY? It makes the API less useful.
>
> So, I would not include trailer_data in the dirty_bytes.
That assumes that it's possible to keep up with the device dirty rate.
It seems like a better approach for userspace would be to look at how
dirty_bytes is trending. A zero value and a steady state value are
equivalent, there's nothing more to be gained by further iterations. If
the value is trending down, it might be worthwhile to iterate in
PRE_COPY a while longer. If the value is trending up, it might be time
to cut to STOP_COPY or abort the migration.
If we exclude STOP_COPY trailing data from the VFIO_DEVICE_MIG_PRECOPY
ioctl, it seems even more of a disconnect that when we enter the
STOP_COPY state, suddenly we start getting new data out of a PRECOPY
ioctl.
BTW, "VFIO_DEVICE" should be reserved for ioctls and data structures
relative to the device FD, appending it with _MIG is too subtle for me.
This is also a GET operation for INFO, so I'd think for consistency
with the existing vfio uAPI we'd name this something like
VFIO_MIG_GET_PRECOPY_INFO where the structure might be named
vfio_precopy_info.
> Estimating when to move on to STOP_COPY and trying to enforce a SLA on
> STOP_COPY are different tasks and will probably end up with different
> interfaces.
>
> I still think the right way to approach the SLA is to inform the
> driver what the permitted time and data size target is for STOP_COPY
> and the driver can proceed or not based on its own internal
> calculation.
So if we don't think this is the right approach for STOP_COPY, then why
are we pushing that it has any purpose outside of PRECOPY or might be
implemented by a non-PRECOPY driver for use in STOP_COPY?
> > useful yet and you don't want to add dead kernel code, then let's
> > define that this ioctl is only available in the PRE_COPY* states and
> > returns -errno in the STOP_COPY state.
>
> I'm OK with that, in acc it is done by checking migf->total_bytes >
> QM_MATCH_SIZE during the read fop
>
> > devices in STOP_COPY and let's also define if there's actually anything
> > userspace can infer about remaining STOP_COPY data size while in
> > PRE_COPY* via this ioctl. For example, is dirty-bytes zero or the
> > remaining data structure size?
>
> If we keep it then I would say it doesn't matter, userspace has to sum
> the two values to get the total remaining length estimate, it is just
> a bit quirky.
For the reasons above, I just can't figure out why wouldn't decide that
use of this outside of PRECOPY is too quirky to bother with. Thanks,
Alex
On Tue, Mar 01, 2022 at 03:44:31PM -0700, Alex Williamson wrote:
> On Tue, 1 Mar 2022 16:39:38 -0400
> Jason Gunthorpe <[email protected]> wrote:
>
> > On Tue, Mar 01, 2022 at 12:30:47PM -0700, Alex Williamson wrote:
> > > Wouldn't it make more sense if initial-bytes started at QM_MATCH_SIZE
> > > and dirty-bytes was always sizeof(vf_data) - QM_MATCH_SIZE? ie. QEMU
> > > would know that it has sizeof(vf_data) - QM_MATCH_SIZE remaining even
> > > while it's getting ENOMSG after reading QM_MATCH_SIZE bytes of data.
> >
> > The purpose of this ioctl is to help userspace guess when moving on to
> > STOP_COPY is a good idea ie when the device has done almost all the
> > work it is going to be able to do in PRE_COPY. ENOMSG is a similar
> > indicator.
> >
> > I expect all devices to have some additional STOP_COPY trailer_data in
> > addition to their PRE_COPY initial_data and dirty_data
> >
> > There is a choice to make if we report the trailer_data during
> > PRE_COPY or not. As this is all estimates, it doesn't matter unless
> > the trailer_data is very big.
> >
> > Having all devices trend toward a 0 dirty_bytes to say they are are
> > done all the pre-copy they can do makes sense from an API
> > perspective. If one device trends toward 10MB due to a big
> > trailer_data and one trends toward 0 bytes, how will qemu consistently
> > decide when best to trigger STOP_COPY? It makes the API less useful.
> >
> > So, I would not include trailer_data in the dirty_bytes.
>
> That assumes that it's possible to keep up with the device dirty
> rate.
It keeps options open so we have this choice someday.
We already see that implementations are using vCPU throttling as part
of their migration strategy, and we are seriously looking at DMA
throttling. It is not a big leap to imagine that
internal-state-dirtying throttling will happne someday.
With throttling iterations would ratchet up the throttle until they
reach an absolute small amount of dirty then cut over to STOP_COPY
> It seems like a better approach for userspace would be to look at how
> dirty_bytes is trending.
It may be biw, but this approach doesn't care if the trailing_bytes
are included or not, so lets leave them out and preserve the other
operating model.
> If we exclude STOP_COPY trailing data from the VFIO_DEVICE_MIG_PRECOPY
> ioctl, it seems even more of a disconnect that when we enter the
> STOP_COPY state, suddenly we start getting new data out of a PRECOPY
> ioctl.
Why? That amounts can go up at any time, how does it matter if it goes
up after STOP_COPY or instantly before?
> BTW, "VFIO_DEVICE" should be reserved for ioctls and data structures
> relative to the device FD, appending it with _MIG is too subtle for me.
> This is also a GET operation for INFO, so I'd think for consistency
> with the existing vfio uAPI we'd name this something like
> VFIO_MIG_GET_PRECOPY_INFO where the structure might be named
> vfio_precopy_info.
Sure
> So if we don't think this is the right approach for STOP_COPY, then why
> are we pushing that it has any purpose outside of PRECOPY or might be
> implemented by a non-PRECOPY driver for use in STOP_COPY?
It is just simpler and more consistent to implement the math under
this ioctl in all cases then to try and artificially restrict it.
But I don't have a use case for it, so lets block it if you prefer.
Shameerali will you make these adjustments to the PRE_COPY patch?
Thanks,
Jason
> -----Original Message-----
> From: Jason Gunthorpe [mailto:[email protected]]
> Sent: 02 March 2022 00:03
> To: Alex Williamson <[email protected]>
> Cc: Shameerali Kolothum Thodi <[email protected]>;
> [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; Linuxarm <[email protected]>; liulongfang
> <[email protected]>; Zengtao (B) <[email protected]>;
> Jonathan Cameron <[email protected]>; Wangzhou (B)
> <[email protected]>
> Subject: Re: [PATCH v6 09/10] hisi_acc_vfio_pci: Add support for VFIO live
> migration
>
> On Tue, Mar 01, 2022 at 03:44:31PM -0700, Alex Williamson wrote:
> > On Tue, 1 Mar 2022 16:39:38 -0400
> > Jason Gunthorpe <[email protected]> wrote:
> >
> > > On Tue, Mar 01, 2022 at 12:30:47PM -0700, Alex Williamson wrote:
> > > > Wouldn't it make more sense if initial-bytes started at QM_MATCH_SIZE
> > > > and dirty-bytes was always sizeof(vf_data) - QM_MATCH_SIZE? ie.
> QEMU
> > > > would know that it has sizeof(vf_data) - QM_MATCH_SIZE remaining even
> > > > while it's getting ENOMSG after reading QM_MATCH_SIZE bytes of data.
> > >
> > > The purpose of this ioctl is to help userspace guess when moving on to
> > > STOP_COPY is a good idea ie when the device has done almost all the
> > > work it is going to be able to do in PRE_COPY. ENOMSG is a similar
> > > indicator.
> > >
> > > I expect all devices to have some additional STOP_COPY trailer_data in
> > > addition to their PRE_COPY initial_data and dirty_data
> > >
> > > There is a choice to make if we report the trailer_data during
> > > PRE_COPY or not. As this is all estimates, it doesn't matter unless
> > > the trailer_data is very big.
> > >
> > > Having all devices trend toward a 0 dirty_bytes to say they are are
> > > done all the pre-copy they can do makes sense from an API
> > > perspective. If one device trends toward 10MB due to a big
> > > trailer_data and one trends toward 0 bytes, how will qemu consistently
> > > decide when best to trigger STOP_COPY? It makes the API less useful.
> > >
> > > So, I would not include trailer_data in the dirty_bytes.
> >
> > That assumes that it's possible to keep up with the device dirty
> > rate.
>
> It keeps options open so we have this choice someday.
>
> We already see that implementations are using vCPU throttling as part
> of their migration strategy, and we are seriously looking at DMA
> throttling. It is not a big leap to imagine that
> internal-state-dirtying throttling will happne someday.
>
> With throttling iterations would ratchet up the throttle until they
> reach an absolute small amount of dirty then cut over to STOP_COPY
>
> > It seems like a better approach for userspace would be to look at how
> > dirty_bytes is trending.
>
> It may be biw, but this approach doesn't care if the trailing_bytes
> are included or not, so lets leave them out and preserve the other
> operating model.
>
> > If we exclude STOP_COPY trailing data from the VFIO_DEVICE_MIG_PRECOPY
> > ioctl, it seems even more of a disconnect that when we enter the
> > STOP_COPY state, suddenly we start getting new data out of a PRECOPY
> > ioctl.
>
> Why? That amounts can go up at any time, how does it matter if it goes
> up after STOP_COPY or instantly before?
>
> > BTW, "VFIO_DEVICE" should be reserved for ioctls and data structures
> > relative to the device FD, appending it with _MIG is too subtle for me.
> > This is also a GET operation for INFO, so I'd think for consistency
> > with the existing vfio uAPI we'd name this something like
> > VFIO_MIG_GET_PRECOPY_INFO where the structure might be named
> > vfio_precopy_info.
>
> Sure
>
> > So if we don't think this is the right approach for STOP_COPY, then why
> > are we pushing that it has any purpose outside of PRECOPY or might be
> > implemented by a non-PRECOPY driver for use in STOP_COPY?
>
> It is just simpler and more consistent to implement the math under
> this ioctl in all cases then to try and artificially restrict it.
>
> But I don't have a use case for it, so lets block it if you prefer.
>
> Shameerali will you make these adjustments to the PRE_COPY patch?
Sure. I think we can summarize the discussion as below,
- Rename the MIG_PRECOPY ioctl to VFIO_MIG_GET_PRECOPY_INFO and
structure to vfio_precopy_info.
- This ioctl is only valid in PRE_COPY state and should return -EINVAL in
other states(Update the documentation).
- No changes to the initial_bytes & dirty_bytes descriptions.
Please let me know if I missed anything.
I will address other comments on this series as well and sent out a
revised one soon.
Thanks,
Shameer