ioasid_set was introduced as an arbitrary token that is shared by a
group of IOASIDs. For example, two IOASIDs allocated via the same
ioasid_set pointer belong to the same set.
For guest SVA usage, system-wide IOASID resources need to be
partitioned such that each VM can have its own quota and be managed
separately. ioasid_set is a good candidate for meeting these
requirements. This patch redefines and extends ioasid_set with the
following new fields:
- Quota
- Reference count
- Storage of its namespace
- The token is now stored in the ioasid_set along with its type
Basic ioasid_set-level APIs are introduced to wire up this new data.
Existing users of the IOASID APIs, including the VT-d driver and
iommu-sva-lib, are converted such that a host IOASID set is allocated
for bare-metal usage.
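
A rough usage sketch of the new set-aware interfaces (illustration only,
not part of this patch; the example function below is made up):

	#include <linux/ioasid.h>

	/* Carve out a quota-limited set, then allocate one PASID from it. */
	static int example_ioasid_set_usage(void)
	{
		struct ioasid_set *set;
		ioasid_t pasid;

		/* NULL token, quota of 16 IOASIDs */
		set = ioasid_set_alloc(NULL, 16, IOASID_SET_TYPE_NULL);
		if (IS_ERR(set))
			return PTR_ERR(set);

		/* Allocations are charged against the set's quota */
		pasid = ioasid_alloc(set, 1, 15, NULL);
		if (pasid == INVALID_IOASID) {
			ioasid_set_free(set);	/* still empty, succeeds */
			return -ENOSPC;
		}

		/* ... use the PASID ... */

		/*
		 * Dropping the last reference frees the PASID; the set is
		 * also torn down once it becomes empty.
		 */
		ioasid_put(set, pasid);
		return 0;
	}
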
Signed-off-by: Liu Yi L <[email protected]>
Signed-off-by: Jacob Pan <[email protected]>
---
.../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 1 +
drivers/iommu/intel/iommu.c | 27 +-
drivers/iommu/intel/pasid.h | 1 +
drivers/iommu/intel/svm.c | 25 +-
drivers/iommu/ioasid.c | 288 +++++++++++++++---
drivers/iommu/iommu-sva-lib.c | 19 +-
include/linux/ioasid.h | 68 ++++-
7 files changed, 361 insertions(+), 68 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index e13b092e6004..588aa66ed5e4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -459,6 +459,7 @@ int arm_smmu_master_enable_sva(struct arm_smmu_master *master)
{
mutex_lock(&sva_lock);
master->sva_enabled = true;
+ iommu_sva_init();
mutex_unlock(&sva_lock);
return 0;
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 6f42ff7d171d..eb9868061545 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -103,6 +103,9 @@
*/
#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
+/* PASIDs used by host SVM */
+struct ioasid_set *host_pasid_set;
+
static inline int agaw_to_level(int agaw)
{
return agaw + 2;
@@ -173,6 +176,7 @@ static struct intel_iommu **g_iommus;
static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;
+static bool scalable_mode_support(void);
/*
* set to 1 to panic kernel if can't successfully enable VT-d
@@ -3114,8 +3118,8 @@ static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
* Sanity check the ioasid owner is done at upper layer, e.g. VFIO
* We can only free the PASID when all the devices are unbound.
*/
- if (ioasid_find(NULL, ioasid, NULL)) {
- pr_alert("Cannot free active IOASID %d\n", ioasid);
+ if (IS_ERR(ioasid_find(host_pasid_set, ioasid, NULL))) {
+ pr_err("IOASID %d to be freed but not in system set\n", ioasid);
return;
}
vcmd_free_pasid(iommu, ioasid);
@@ -3300,8 +3304,17 @@ static int __init init_dmars(void)
goto free_iommu;
/* PASID is needed for scalable mode irrespective to SVM */
- if (intel_iommu_sm)
+ if (scalable_mode_support()) {
ioasid_install_capacity(intel_pasid_max_id);
+ /* We should not run out of IOASIDs at boot */
+ host_pasid_set = ioasid_set_alloc(NULL, PID_MAX_DEFAULT,
+ IOASID_SET_TYPE_NULL);
+ if (IS_ERR_OR_NULL(host_pasid_set)) {
+ pr_err("Failed to allocate host PASID set %lu\n",
+ PTR_ERR(host_pasid_set));
+ intel_iommu_sm = 0;
+ }
+ }
/*
* for each drhd
@@ -3348,7 +3361,7 @@ static int __init init_dmars(void)
disable_dmar_iommu(iommu);
free_dmar_iommu(iommu);
}
-
+ ioasid_set_free(host_pasid_set);
kfree(g_iommus);
error:
@@ -4573,7 +4586,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain,
u32 pasid;
/* No private data needed for the default pasid */
- pasid = ioasid_alloc(NULL, PASID_MIN,
+ pasid = ioasid_alloc(host_pasid_set, PASID_MIN,
pci_max_pasids(to_pci_dev(dev)) - 1,
NULL);
if (pasid == INVALID_IOASID) {
@@ -4630,7 +4643,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain,
link_failed:
spin_unlock_irqrestore(&device_domain_lock, flags);
if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
- ioasid_put(domain->default_pasid);
+ ioasid_put(host_pasid_set, domain->default_pasid);
return ret;
}
@@ -4660,7 +4673,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain,
spin_unlock_irqrestore(&device_domain_lock, flags);
if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
- ioasid_put(domain->default_pasid);
+ ioasid_put(host_pasid_set, domain->default_pasid);
}
static int prepare_domain_attach_device(struct iommu_domain *domain,
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index 97dfcffbf495..12b5ca18de5d 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -99,6 +99,7 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte)
}
extern unsigned int intel_pasid_max_id;
+extern struct ioasid_set *host_pasid_set;
int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp);
void intel_pasid_free_id(u32 pasid);
void *intel_pasid_lookup_id(u32 pasid);
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 68372a7eb8b5..c469c24d23f5 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -247,7 +247,9 @@ static LIST_HEAD(global_svm_list);
list_for_each_entry((sdev), &(svm)->devs, list) \
if ((d) != (sdev)->dev) {} else
-static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
+static int pasid_to_svm_sdev(struct device *dev,
+ struct ioasid_set *set,
+ unsigned int pasid,
struct intel_svm **rsvm,
struct intel_svm_dev **rsdev)
{
@@ -261,7 +263,7 @@ static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
return -EINVAL;
- svm = ioasid_find(NULL, pasid, NULL);
+ svm = ioasid_find(set, pasid, NULL);
if (IS_ERR(svm))
return PTR_ERR(svm);
@@ -337,7 +339,8 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
dmar_domain = to_dmar_domain(domain);
mutex_lock(&pasid_mutex);
- ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
+ ret = pasid_to_svm_sdev(dev, NULL,
+ data->hpasid, &svm, &sdev);
if (ret)
goto out;
@@ -444,7 +447,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
return -EINVAL;
mutex_lock(&pasid_mutex);
- ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
+ ret = pasid_to_svm_sdev(dev, NULL, pasid, &svm, &sdev);
if (ret)
goto out;
@@ -602,7 +605,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
pasid_max = intel_pasid_max_id;
/* Do not use PASID 0, reserved for RID to PASID */
- svm->pasid = ioasid_alloc(NULL, PASID_MIN,
+ svm->pasid = ioasid_alloc(host_pasid_set, PASID_MIN,
pasid_max - 1, svm);
if (svm->pasid == INVALID_IOASID) {
kfree(svm);
@@ -619,7 +622,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
if (mm) {
ret = mmu_notifier_register(&svm->notifier, mm);
if (ret) {
- ioasid_put(svm->pasid);
+ ioasid_put(host_pasid_set, svm->pasid);
kfree(svm);
kfree(sdev);
goto out;
@@ -637,7 +640,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
if (ret) {
if (mm)
mmu_notifier_unregister(&svm->notifier, mm);
- ioasid_put(svm->pasid);
+ ioasid_put(host_pasid_set, svm->pasid);
kfree(svm);
kfree(sdev);
goto out;
@@ -689,7 +692,8 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
if (!iommu)
goto out;
- ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
+ ret = pasid_to_svm_sdev(dev, host_pasid_set,
+ pasid, &svm, &sdev);
if (ret)
goto out;
@@ -710,7 +714,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
kfree_rcu(sdev, rcu);
if (list_empty(&svm->devs)) {
- ioasid_put(svm->pasid);
+ ioasid_put(host_pasid_set, svm->pasid);
if (svm->mm) {
mmu_notifier_unregister(&svm->notifier, svm->mm);
/* Clear mm's pasid. */
@@ -1184,7 +1188,8 @@ int intel_svm_page_response(struct device *dev,
goto out;
}
- ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
+ ret = pasid_to_svm_sdev(dev, host_pasid_set,
+ prm->pasid, &svm, &sdev);
if (ret || !sdev) {
ret = -ENODEV;
goto out;
diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 28681b99340b..d7b476651027 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
/*
* I/O Address Space ID allocator. There is one global IOASID space, split into
- * subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and
- * free IOASIDs with ioasid_alloc and ioasid_put.
+ * sets. Users create a set with ioasid_set_alloc, then allocate/free IDs
+ * with ioasid_alloc, ioasid_put, and ioasid_free.
*/
#include <linux/ioasid.h>
#include <linux/module.h>
@@ -14,6 +14,7 @@
#define PCI_PASID_MAX 0x100000
static ioasid_t ioasid_capacity = PCI_PASID_MAX;
static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX;
+static DEFINE_XARRAY_ALLOC(ioasid_sets);
struct ioasid_data {
ioasid_t id;
struct ioasid_set *set;
@@ -394,6 +395,151 @@ void ioasid_detach_data(ioasid_t ioasid)
}
EXPORT_SYMBOL_GPL(ioasid_detach_data);
+static inline bool ioasid_set_is_valid(struct ioasid_set *set)
+{
+ return xa_load(&ioasid_sets, set->id) == set;
+}
+
+/**
+ * ioasid_set_alloc - Allocate a new IOASID set for a given token
+ *
+ * @token: An optional arbitrary number that can be associated with the
+ * IOASID set. @token can be NULL if the type is
+ * IOASID_SET_TYPE_NULL
+ * @quota: Quota allowed in this set, 0 indicates no limit for the set
+ * @type: The type of the token used to create the IOASID set
+ *
+ * IOASIDs are a limited system-wide resource that requires quota management.
+ * The token will be stored in the returned ioasid_set. A reference will be
+ * taken on the newly created set. Subsequent IOASID allocations within the
+ * set need to use the returned ioasid_set pointer.
+ */
+struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, int type)
+{
+ struct ioasid_set *set;
+ unsigned long index;
+ ioasid_t id;
+
+ if (type >= IOASID_SET_TYPE_NR)
+ return ERR_PTR(-EINVAL);
+
+ /* No limit for the set, use whatever is available on the system */
+ if (!quota)
+ quota = ioasid_capacity_avail;
+
+ spin_lock(&ioasid_allocator_lock);
+ if (quota > ioasid_capacity_avail) {
+ pr_warn("Out of IOASID capacity! ask %d, avail %d\n",
+ quota, ioasid_capacity_avail);
+ set = ERR_PTR(-ENOSPC);
+ goto exit_unlock;
+ }
+
+ /*
+ * A token is only unique within its type, but right now we only have the
+ * mm type. If we add more token types, we will have to match the type as well.
+ */
+ switch (type) {
+ case IOASID_SET_TYPE_MM:
+ if (!token) {
+ set = ERR_PTR(-EINVAL);
+ goto exit_unlock;
+ }
+ /* Search existing set tokens, reject duplicates */
+ xa_for_each(&ioasid_sets, index, set) {
+ if (set->token == token && set->type == IOASID_SET_TYPE_MM) {
+ set = ERR_PTR(-EEXIST);
+ goto exit_unlock;
+ }
+ }
+ break;
+ case IOASID_SET_TYPE_NULL:
+ if (!token)
+ break;
+ fallthrough;
+ default:
+ pr_err("Invalid token and IOASID type\n");
+ set = ERR_PTR(-EINVAL);
+ goto exit_unlock;
+ }
+
+ set = kzalloc(sizeof(*set), GFP_ATOMIC);
+ if (!set) {
+ set = ERR_PTR(-ENOMEM);
+ goto exit_unlock;
+ }
+
+ if (xa_alloc(&ioasid_sets, &id, set,
+ XA_LIMIT(0, ioasid_capacity_avail),
+ GFP_ATOMIC)) {
+ kfree(set);
+ set = ERR_PTR(-ENOSPC);
+ goto exit_unlock;
+ }
+
+ set->token = token;
+ set->type = type;
+ set->quota = quota;
+ set->id = id;
+ atomic_set(&set->nr_ioasids, 0);
+ /*
+ * Per set XA is used to store private IDs within the set, get ready
+ * for ioasid_set private ID and system-wide IOASID allocation
+ * results.
+ */
+ xa_init(&set->xa);
+ ioasid_capacity_avail -= quota;
+
+exit_unlock:
+ spin_unlock(&ioasid_allocator_lock);
+
+ return set;
+}
+EXPORT_SYMBOL_GPL(ioasid_set_alloc);
+
+static int ioasid_set_free_locked(struct ioasid_set *set)
+{
+ int ret = 0;
+
+ if (!ioasid_set_is_valid(set)) {
+ ret = -EINVAL;
+ goto exit_done;
+ }
+
+ if (atomic_read(&set->nr_ioasids)) {
+ ret = -EBUSY;
+ goto exit_done;
+ }
+
+ WARN_ON(!xa_empty(&set->xa));
+ /*
+ * The token is released right away when the ioasid_set is freed.
+ * If a new set is created immediately with the newly released token,
+ * it will not allocate the same IOASIDs unless they are reclaimed.
+ */
+ xa_erase(&ioasid_sets, set->id);
+ kfree_rcu(set, rcu);
+exit_done:
+ return ret;
+};
+
+/**
+ * ioasid_set_free - Free an ioasid_set if it contains no IOASIDs
+ * @set: The ioasid_set to be freed
+ *
+ * Return: 0 on success, or an error code if the set is still in use.
+ */
+int ioasid_set_free(struct ioasid_set *set)
+{
+ int ret = 0;
+
+ spin_lock(&ioasid_allocator_lock);
+ ret = ioasid_set_free_locked(set);
+ spin_unlock(&ioasid_allocator_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_set_free);
+
/**
* ioasid_alloc - Allocate an IOASID
* @set: the IOASID set
@@ -411,11 +557,22 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
{
struct ioasid_data *data;
void *adata;
- ioasid_t id;
+ ioasid_t id = INVALID_IOASID;
+
+ spin_lock(&ioasid_allocator_lock);
+ /* Check if the IOASID set has been allocated and initialized */
+ if (!ioasid_set_is_valid(set))
+ goto done_unlock;
+
+ if (set->quota <= atomic_read(&set->nr_ioasids)) {
+ pr_err_ratelimited("IOASID set out of quota %d\n",
+ set->quota);
+ goto done_unlock;
+ }
data = kzalloc(sizeof(*data), GFP_ATOMIC);
if (!data)
- return INVALID_IOASID;
+ goto done_unlock;
data->set = set;
data->private = private;
@@ -425,7 +582,6 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
* Custom allocator needs allocator data to perform platform specific
* operations.
*/
- spin_lock(&ioasid_allocator_lock);
adata = active_allocator->flags & IOASID_ALLOCATOR_CUSTOM ? active_allocator->ops->pdata : data;
id = active_allocator->ops->alloc(min, max, adata);
if (id == INVALID_IOASID) {
@@ -442,67 +598,121 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
}
data->id = id;
- spin_unlock(&ioasid_allocator_lock);
- return id;
+ /* Store IOASID in the per set data */
+ if (xa_err(xa_store(&set->xa, id, data, GFP_ATOMIC))) {
+ pr_err_ratelimited("Failed to store ioasid %d in set\n", id);
+ active_allocator->ops->free(id, active_allocator->ops->pdata);
+ goto exit_free;
+ }
+ atomic_inc(&set->nr_ioasids);
+ goto done_unlock;
exit_free:
- spin_unlock(&ioasid_allocator_lock);
kfree(data);
- return INVALID_IOASID;
+done_unlock:
+ spin_unlock(&ioasid_allocator_lock);
+ return id;
}
EXPORT_SYMBOL_GPL(ioasid_alloc);
+static void ioasid_do_free_locked(struct ioasid_data *data)
+{
+ struct ioasid_data *ioasid_data;
+
+ active_allocator->ops->free(data->id, active_allocator->ops->pdata);
+ /* Custom allocator needs additional steps to free the xa element */
+ if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) {
+ ioasid_data = xa_erase(&active_allocator->xa, data->id);
+ kfree_rcu(ioasid_data, rcu);
+ }
+ atomic_dec(&data->set->nr_ioasids);
+ xa_erase(&data->set->xa, data->id);
+ /* Destroy the set if empty */
+ if (!atomic_read(&data->set->nr_ioasids))
+ ioasid_set_free_locked(data->set);
+}
+
+int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid)
+{
+ struct ioasid_data *data;
+
+ data = xa_load(&active_allocator->xa, ioasid);
+ if (!data) {
+ pr_err("Trying to get unknown IOASID %u\n", ioasid);
+ return -EINVAL;
+ }
+
+ /* Check set ownership if the set is non-null */
+ if (set && data->set != set) {
+ pr_err("Trying to get IOASID %u outside the set\n", ioasid);
+ /* data found but does not belong to the set */
+ return -EACCES;
+ }
+ refcount_inc(&data->refs);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ioasid_get_locked);
+
/**
* ioasid_get - obtain a reference to the IOASID
+ * @set: the ioasid_set to check permission against if not NULL
+ * @ioasid: the IOASID to take a reference on
+ *
+ *
+ * Return: 0 on success, or an error code on failure.
*/
-void ioasid_get(ioasid_t ioasid)
+int ioasid_get(struct ioasid_set *set, ioasid_t ioasid)
{
- struct ioasid_data *ioasid_data;
+ int ret;
spin_lock(&ioasid_allocator_lock);
- ioasid_data = xa_load(&active_allocator->xa, ioasid);
- if (ioasid_data)
- refcount_inc(&ioasid_data->refs);
- else
- WARN_ON(1);
+ ret = ioasid_get_locked(set, ioasid);
spin_unlock(&ioasid_allocator_lock);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(ioasid_get);
+bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid)
+{
+ struct ioasid_data *data;
+
+ data = xa_load(&active_allocator->xa, ioasid);
+ if (!data) {
+ pr_err("Trying to put unknown IOASID %u\n", ioasid);
+ return false;
+ }
+ if (set && data->set != set) {
+ pr_err("Trying to drop IOASID %u outside the set\n", ioasid);
+ return false;
+ }
+ if (!refcount_dec_and_test(&data->refs))
+ return false;
+
+ ioasid_do_free_locked(data);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(ioasid_put_locked);
+
/**
* ioasid_put - Release a reference to an ioasid
- * @ioasid: the ID to remove
+ * @set: the ioasid_set to check permission against if not NULL
+ * @ioasid: the IOASID to drop a reference on
*
* Put a reference to the IOASID, free it when the number of references drops to
* zero.
*
* Return: %true if the IOASID was freed, %false otherwise.
*/
-bool ioasid_put(ioasid_t ioasid)
+bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid)
{
- bool free = false;
- struct ioasid_data *ioasid_data;
+ bool ret;
spin_lock(&ioasid_allocator_lock);
- ioasid_data = xa_load(&active_allocator->xa, ioasid);
- if (!ioasid_data) {
- pr_err("Trying to free unknown IOASID %u\n", ioasid);
- goto exit_unlock;
- }
-
- free = refcount_dec_and_test(&ioasid_data->refs);
- if (!free)
- goto exit_unlock;
-
- active_allocator->ops->free(ioasid, active_allocator->ops->pdata);
- /* Custom allocator needs additional steps to free the xa element */
- if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) {
- ioasid_data = xa_erase(&active_allocator->xa, ioasid);
- kfree_rcu(ioasid_data, rcu);
- }
-
-exit_unlock:
+ ret = ioasid_put_locked(set, ioasid);
spin_unlock(&ioasid_allocator_lock);
- return free;
+ return ret;
}
EXPORT_SYMBOL_GPL(ioasid_put);
diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c
index bd41405d34e9..7f97a03a135b 100644
--- a/drivers/iommu/iommu-sva-lib.c
+++ b/drivers/iommu/iommu-sva-lib.c
@@ -8,7 +8,16 @@
#include "iommu-sva-lib.h"
static DEFINE_MUTEX(iommu_sva_lock);
-static DECLARE_IOASID_SET(iommu_sva_pasid);
+static struct ioasid_set *iommu_sva_pasid;
+
+/* Must be called before PASID allocations can occur */
+void iommu_sva_init(void)
+{
+ if (iommu_sva_pasid)
+ return;
+ iommu_sva_pasid = ioasid_set_alloc(NULL, 0, IOASID_SET_TYPE_NULL);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_init);
/**
* iommu_sva_alloc_pasid - Allocate a PASID for the mm
@@ -35,11 +44,11 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max)
mutex_lock(&iommu_sva_lock);
if (mm->pasid) {
if (mm->pasid >= min && mm->pasid <= max)
- ioasid_get(mm->pasid);
+ ioasid_get(iommu_sva_pasid, mm->pasid);
else
ret = -EOVERFLOW;
} else {
- pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
+ pasid = ioasid_alloc(iommu_sva_pasid, min, max, mm);
if (pasid == INVALID_IOASID)
ret = -ENOMEM;
else
@@ -59,7 +68,7 @@ EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid);
void iommu_sva_free_pasid(struct mm_struct *mm)
{
mutex_lock(&iommu_sva_lock);
- if (ioasid_put(mm->pasid))
+ if (ioasid_put(iommu_sva_pasid, mm->pasid))
mm->pasid = 0;
mutex_unlock(&iommu_sva_lock);
}
@@ -81,6 +90,6 @@ static bool __mmget_not_zero(void *mm)
*/
struct mm_struct *iommu_sva_find(ioasid_t pasid)
{
- return ioasid_find(&iommu_sva_pasid, pasid, __mmget_not_zero);
+ return ioasid_find(iommu_sva_pasid, pasid, __mmget_not_zero);
}
EXPORT_SYMBOL_GPL(iommu_sva_find);
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index 2780bdc84b94..095f4e50dc58 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -4,14 +4,43 @@
#include <linux/types.h>
#include <linux/errno.h>
+#include <linux/xarray.h>
+#include <linux/refcount.h>
#define INVALID_IOASID ((ioasid_t)-1)
typedef unsigned int ioasid_t;
typedef ioasid_t (*ioasid_alloc_fn_t)(ioasid_t min, ioasid_t max, void *data);
typedef void (*ioasid_free_fn_t)(ioasid_t ioasid, void *data);
+/* IOASID set types */
+enum ioasid_set_type {
+ IOASID_SET_TYPE_NULL = 1, /* Set token is NULL */
+ IOASID_SET_TYPE_MM, /* Set token is a mm_struct pointer
+ * i.e. associated with a process
+ */
+ IOASID_SET_TYPE_NR,
+};
+
+/**
+ * struct ioasid_set - Metadata for an ioasid_set
+ * @nh: List of notifiers private to that set
+ * @xa: XArray to store ioasid_set private IDs, can be used for
+ * guest-host IOASID mapping, or just a private IOASID namespace.
+ * @token: Unique token identifying the IOASID set
+ * @type: Token type
+ * @quota: Max number of IOASIDs that can be allocated within the set
+ * @nr_ioasids: Number of IOASIDs currently allocated in the set
+ * @id: ID of the set
+ */
struct ioasid_set {
- int dummy;
+ struct atomic_notifier_head nh;
+ struct xarray xa;
+ void *token;
+ int type;
+ int quota;
+ atomic_t nr_ioasids;
+ int id;
+ struct rcu_head rcu;
};
/**
@@ -29,17 +58,20 @@ struct ioasid_allocator_ops {
void *pdata;
};
-#define DECLARE_IOASID_SET(name) struct ioasid_set name = { 0 }
-
#if IS_ENABLED(CONFIG_IOASID)
void ioasid_install_capacity(ioasid_t total);
int ioasid_reserve_capacity(ioasid_t nr_ioasid);
int ioasid_cancel_capacity(ioasid_t nr_ioasid);
+struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, int type);
+int ioasid_set_free(struct ioasid_set *set);
+struct ioasid_set *ioasid_find_mm_set(struct mm_struct *token);
ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
void *private);
-void ioasid_get(ioasid_t ioasid);
-bool ioasid_put(ioasid_t ioasid);
+int ioasid_get(struct ioasid_set *set, ioasid_t ioasid);
+int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid);
+bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid);
+bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid);
void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
bool (*getter)(void *));
int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
@@ -67,11 +99,33 @@ static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
return INVALID_IOASID;
}
-static inline void ioasid_get(ioasid_t ioasid)
+static inline struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota,
+ ioasid_set_type type)
{
+ return ERR_PTR(-ENOTSUPP);
+}
+
+static inline struct ioasid_set *ioasid_find_mm_set(struct mm_struct *token)
+{
+ return NULL;
+}
+
+static inline int ioasid_get(struct ioasid_set *set, ioasid_t ioasid)
+{
+ return -ENOTSUPP;
+}
+
+static inline int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid)
+{
+ return -ENOTSUPP;
+}
+
+static inline bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid)
+{
+ return false;
}
-static inline bool ioasid_put(ioasid_t ioasid)
+static inline bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid)
{
return false;
}
--
2.25.1
Hi Jean,
Slightly off the title. As we are moving to use cgroup to limit PASID
allocations, it would be much simpler if we enforce on the current task.
However, iommu_sva_alloc_pasid() takes an mm_struct pointer as argument
which implies it can be something other than the current task mm. So far all
kernel callers use current task mm. Is there a use case for doing PASID
allocation on behalf of another mm? If not, can we remove the mm argument?
Thanks,
Jacob
> /**
> * iommu_sva_alloc_pasid - Allocate a PASID for the mm
> @@ -35,11 +44,11 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm,
> ioasid_t min, ioasid_t max) mutex_lock(&iommu_sva_lock);
> if (mm->pasid) {
> if (mm->pasid >= min && mm->pasid <= max)
> - ioasid_get(mm->pasid);
> + ioasid_get(iommu_sva_pasid, mm->pasid);
> else
> ret = -EOVERFLOW;
> } else {
> - pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
> + pasid = ioasid_alloc(iommu_sva_pasid, min, max, mm);
> if (pasid == INVALID_IOASID)
> ret = -ENOMEM;
Thanks,
Jacob
Hi Jacob,
On Thu, Mar 18, 2021 at 05:22:34PM -0700, Jacob Pan wrote:
> Hi Jean,
>
> Slightly off the title. As we are moving to use cgroup to limit PASID
> allocations, it would be much simpler if we enforce on the current task.
Yes I think we should do that. Is there a problem with charging the
process that does the PASID allocation even if the PASID indexes some
other mm?
> However, iommu_sva_alloc_pasid() takes an mm_struct pointer as argument
> which implies it can be something other than the current task mm. So far all
> kernel callers use current task mm. Is there a use case for doing PASID
> allocation on behalf of another mm? If not, can we remove the mm argument?
This would effectively remove the mm parameter from
iommu_sva_bind_device(). I'm not opposed to that, but reintroducing it
later will be difficult if IOMMU drivers start assuming that the bound mm
is from current.
Although there is no use for it at the moment (only two upstream users and
it looks like amdkfd always uses current too), I quite like the
client-server model where the privileged process does bind() and programs
the hardware queue on behalf of the client process.
Thanks,
Jean
On Fri, Mar 19, 2021 at 10:58:41AM +0100, Jean-Philippe Brucker wrote:
> Although there is no use for it at the moment (only two upstream users and
> it looks like amdkfd always uses current too), I quite like the
> client-server model where the privileged process does bind() and programs
> the hardware queue on behalf of the client process.
This creates a lot complexity, how do does process A get a secure
reference to B? How does it access the memory in B to setup the HW?
Why do we need separation anyhow? SVM devices are supposed to be
secure or they shouldn't do SVM.
Jason
On Fri, Mar 19, 2021 at 09:46:45AM -0300, Jason Gunthorpe wrote:
> On Fri, Mar 19, 2021 at 10:58:41AM +0100, Jean-Philippe Brucker wrote:
>
> > Although there is no use for it at the moment (only two upstream users and
> > it looks like amdkfd always uses current too), I quite like the
> > client-server model where the privileged process does bind() and programs
> > the hardware queue on behalf of the client process.
>
> This creates a lot complexity, how do does process A get a secure
> reference to B? How does it access the memory in B to setup the HW?
mm_access() for example, and passing addresses via IPC
> Why do we need separation anyhow? SVM devices are supposed to be
> secure or they shouldn't do SVM.
Right
Thanks,
Jean
On Fri, Mar 19, 2021 at 02:41:32PM +0100, Jean-Philippe Brucker wrote:
> On Fri, Mar 19, 2021 at 09:46:45AM -0300, Jason Gunthorpe wrote:
> > On Fri, Mar 19, 2021 at 10:58:41AM +0100, Jean-Philippe Brucker wrote:
> >
> > > Although there is no use for it at the moment (only two upstream users and
> > > it looks like amdkfd always uses current too), I quite like the
> > > client-server model where the privileged process does bind() and programs
> > > the hardware queue on behalf of the client process.
> >
> > This creates a lot complexity, how do does process A get a secure
> > reference to B? How does it access the memory in B to setup the HW?
>
> mm_access() for example, and passing addresses via IPC
I'd rather the source process establish its own PASID and then pass
the rights to use it to some other process via FD passing than try to
go the other way. There are lots of security questions with something
like mm_access.
Jason
Hi Jean-Philippe,
On Fri, 19 Mar 2021 10:58:41 +0100, Jean-Philippe Brucker
<[email protected]> wrote:
> > Slightly off the title. As we are moving to use cgroup to limit PASID
> > allocations, it would be much simpler if we enforce on the current
> > task.
>
> Yes I think we should do that. Is there a problem with charging the
> process that does the PASID allocation even if the PASID indexes some
> other mm?
Besides complexity, my second concern is that we are sharing the misc
cgroup controller with other resources that do not have such behavior.
Cgroup v2 also has a unified hierarchy, which requires coherent behavior
among controllers.
Thanks,
Jacob
Hi Jason,
On Fri, 19 Mar 2021 10:54:32 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Fri, Mar 19, 2021 at 02:41:32PM +0100, Jean-Philippe Brucker wrote:
> > On Fri, Mar 19, 2021 at 09:46:45AM -0300, Jason Gunthorpe wrote:
> > > On Fri, Mar 19, 2021 at 10:58:41AM +0100, Jean-Philippe Brucker wrote:
> > >
> > > > Although there is no use for it at the moment (only two upstream
> > > > users and it looks like amdkfd always uses current too), I quite
> > > > like the client-server model where the privileged process does
> > > > bind() and programs the hardware queue on behalf of the client
> > > > process.
> > >
> > > This creates a lot complexity, how do does process A get a secure
> > > reference to B? How does it access the memory in B to setup the HW?
> >
> > mm_access() for example, and passing addresses via IPC
>
> I'd rather the source process establish its own PASID and then pass
> the rights to use it to some other process via FD passing than try to
> go the other way. There are lots of security questions with something
> like mm_access.
>
Thank you all for the input, it sounds like we are OK to remove mm argument
from iommu_sva_bind_device() and iommu_sva_alloc_pasid() for now?
Let me try to summarize PASID allocation as below:
Interfaces   | Usage         | Limit  | bind¹ | User visible
--------------------------------------------------------------------
/dev/ioasid² | G-SVA/IOVA    | cgroup | No    | Yes
--------------------------------------------------------------------
char dev³    | SVA           | cgroup | Yes   | No
--------------------------------------------------------------------
iommu driver | default PASID | no     | No    | No
--------------------------------------------------------------------
kernel       | super SVA     | no     | yes   | No
--------------------------------------------------------------------
¹ Allocated during SVA bind
² PASIDs allocated via /dev/ioasid are not bound to any mm. But its
ownership is assigned to the process that does the allocation.
³ Include uacce, other private device driver char dev such as idxd
Currently, the proposed /dev/ioasid interface does not map individual PASID
with an FD. The FD is at the ioasid_set granularity and bond to the current
mm. We could extend the IOCTLs to cover individual PASID-FD passing case
when use cases arise. Would this work?
Thanks,
Jacob
On Fri, Mar 19, 2021 at 11:22:21AM -0700, Jacob Pan wrote:
> Hi Jason,
>
> On Fri, 19 Mar 2021 10:54:32 -0300, Jason Gunthorpe <[email protected]> wrote:
>
> > On Fri, Mar 19, 2021 at 02:41:32PM +0100, Jean-Philippe Brucker wrote:
> > > On Fri, Mar 19, 2021 at 09:46:45AM -0300, Jason Gunthorpe wrote:
> > > > On Fri, Mar 19, 2021 at 10:58:41AM +0100, Jean-Philippe Brucker wrote:
> > > >
> > > > > Although there is no use for it at the moment (only two upstream
> > > > > users and it looks like amdkfd always uses current too), I quite
> > > > > like the client-server model where the privileged process does
> > > > > bind() and programs the hardware queue on behalf of the client
> > > > > process.
> > > >
> > > > This creates a lot complexity, how do does process A get a secure
> > > > reference to B? How does it access the memory in B to setup the HW?
> > >
> > > mm_access() for example, and passing addresses via IPC
> >
> > I'd rather the source process establish its own PASID and then pass
> > the rights to use it to some other process via FD passing than try to
> > go the other way. There are lots of security questions with something
> > like mm_access.
> >
>
> Thank you all for the input, it sounds like we are OK to remove mm argument
> from iommu_sva_bind_device() and iommu_sva_alloc_pasid() for now?
Fine by me. By the way the IDXD currently misuses the bind API for
supervisor PASID, and the drvdata parameter isn't otherwise used. This
would be a good occasion to clean both. The new bind prototype could be:
struct iommu_sva *iommu_sva_bind_device(struct device *dev, int flags)
And a flag IOMMU_SVA_BIND_SUPERVISOR (not that I plan to implement it in
the SMMU, but I think we need to clean the current usage)
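
For illustration, a call site under that proposed prototype might look
like the sketch below. Both the flags argument and
IOMMU_SVA_BIND_SUPERVISOR are hypothetical at this point, and the helper
name is made up:

	/* Hypothetical: supervisor bind without overloading drvdata */
	static int example_supervisor_bind(struct device *dev, u32 *pasid)
	{
		struct iommu_sva *handle;

		handle = iommu_sva_bind_device(dev, IOMMU_SVA_BIND_SUPERVISOR);
		if (IS_ERR(handle))
			return PTR_ERR(handle);

		/* PASID to program into the device's work descriptors */
		*pasid = iommu_sva_get_pasid(handle);
		return 0;
	}
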
>
> Let me try to summarize PASID allocation as below:
>
> Interfaces | Usage | Limit | bind¹ |User visible
> --------------------------------------------------------------------
> /dev/ioasid² | G-SVA/IOVA | cgroup | No |Yes
> --------------------------------------------------------------------
> char dev³ | SVA | cgroup | Yes |No
> --------------------------------------------------------------------
> iommu driver | default PASID| no | No |No
Is this PASID #0?
> --------------------------------------------------------------------
> kernel | super SVA | no | yes |No
> --------------------------------------------------------------------
Also wondering about device driver allocating auxiliary domains for their
private use, to do iommu_map/unmap on private PASIDs (a clean replacement
to super SVA, for example). Would that go through the same path as
/dev/ioasid and use the cgroup of current task?
Thanks,
Jean
>
> ¹ Allocated during SVA bind
> ² PASIDs allocated via /dev/ioasid are not bound to any mm. But its
> ownership is assigned to the process that does the allocation.
> ³ Include uacce, other private device driver char dev such as idxd
>
> Currently, the proposed /dev/ioasid interface does not map individual PASID
> with an FD. The FD is at the ioasid_set granularity and bond to the current
> mm. We could extend the IOCTLs to cover individual PASID-FD passing case
> when use cases arise. Would this work?
>
> Thanks,
>
> Jacob
On Fri, Mar 19, 2021 at 11:22:21AM -0700, Jacob Pan wrote:
> Hi Jason,
>
> On Fri, 19 Mar 2021 10:54:32 -0300, Jason Gunthorpe <[email protected]> wrote:
>
> > On Fri, Mar 19, 2021 at 02:41:32PM +0100, Jean-Philippe Brucker wrote:
> > > On Fri, Mar 19, 2021 at 09:46:45AM -0300, Jason Gunthorpe wrote:
> > > > On Fri, Mar 19, 2021 at 10:58:41AM +0100, Jean-Philippe Brucker wrote:
> > > >
> > > > > Although there is no use for it at the moment (only two upstream
> > > > > users and it looks like amdkfd always uses current too), I quite
> > > > > like the client-server model where the privileged process does
> > > > > bind() and programs the hardware queue on behalf of the client
> > > > > process.
> > > >
> > > > This creates a lot complexity, how do does process A get a secure
> > > > reference to B? How does it access the memory in B to setup the HW?
> > >
> > > mm_access() for example, and passing addresses via IPC
> >
> > I'd rather the source process establish its own PASID and then pass
> > the rights to use it to some other process via FD passing than try to
> > go the other way. There are lots of security questions with something
> > like mm_access.
> >
>
> Thank you all for the input, it sounds like we are OK to remove mm argument
> from iommu_sva_bind_device() and iommu_sva_alloc_pasid() for now?
>
> Let me try to summarize PASID allocation as below:
>
> Interfaces | Usage | Limit | bind¹ |User visible
> /dev/ioasid² | G-SVA/IOVA | cgroup | No |Yes
> char dev³ | SVA | cgroup | Yes |No
> iommu driver | default PASID| no | No |No
> kernel | super SVA | no | yes |No
>
> ¹ Allocated during SVA bind
> ² PASIDs allocated via /dev/ioasid are not bound to any mm. But its
> ownership is assigned to the process that does the allocation.
What does "not bound to a mm" mean?
IMHO a user-created PASID is either bound to a mm (current) at creation
time, or it will never be bound to a mm and its page table is under
user control via /dev/ioasid.
I thought the whole point of something like a /dev/ioasid was to get
away from each and every device creating its own PASID interface?
It maybe somewhat reasonable that some devices could have some easy
'make a SVA PASID on current' interface built in, but anything more
complicated should use /dev/ioasid, and anything consuming PASID
should also have an API to import and attach a PASID from /dev/ioasid.
> Currently, the proposed /dev/ioasid interface does not map individual PASID
> with an FD. The FD is at the ioasid_set granularity and bond to the current
> mm. We could extend the IOCTLs to cover individual PASID-FD passing case
> when use cases arise. Would this work?
Is it a good idea that the FD is per ioasid_set ? What is the set used
for?
Usually kernel interfaces work nicer with a one fd/one object model.
But even if it is a set, you could pass the set between co-operating
processes and the PASID can be created in the correct 'current'. But
there are all kinds of security questions as soon as you start doing
anything like this - is there really a use case?
Jason
Hi Jean-Philippe,
On Mon, 22 Mar 2021 10:24:00 +0100, Jean-Philippe Brucker
<[email protected]> wrote:
> On Fri, Mar 19, 2021 at 11:22:21AM -0700, Jacob Pan wrote:
> > Hi Jason,
> >
> > On Fri, 19 Mar 2021 10:54:32 -0300, Jason Gunthorpe <[email protected]>
> > wrote:
> > > On Fri, Mar 19, 2021 at 02:41:32PM +0100, Jean-Philippe Brucker
> > > wrote:
> > > > On Fri, Mar 19, 2021 at 09:46:45AM -0300, Jason Gunthorpe wrote:
> > > > > On Fri, Mar 19, 2021 at 10:58:41AM +0100, Jean-Philippe Brucker
> > > > > wrote:
> > > > > > Although there is no use for it at the moment (only two upstream
> > > > > > users and it looks like amdkfd always uses current too), I quite
> > > > > > like the client-server model where the privileged process does
> > > > > > bind() and programs the hardware queue on behalf of the client
> > > > > > process.
> > > > >
> > > > > This creates a lot complexity, how do does process A get a secure
> > > > > reference to B? How does it access the memory in B to setup the
> > > > > HW?
> > > >
> > > > mm_access() for example, and passing addresses via IPC
> > >
> > > I'd rather the source process establish its own PASID and then pass
> > > the rights to use it to some other process via FD passing than try to
> > > go the other way. There are lots of security questions with something
> > > like mm_access.
> > >
> >
> > Thank you all for the input, it sounds like we are OK to remove mm
> > argument from iommu_sva_bind_device() and iommu_sva_alloc_pasid() for
> > now?
>
> Fine by me. By the way the IDXD currently misuses the bind API for
> supervisor PASID, and the drvdata parameter isn't otherwise used. This
> would be a good occasion to clean both. The new bind prototype could be:
>
> struct iommu_sva *iommu_sva_bind_device(struct device *dev, int flags)
>
yes, we really just hijacked drvdata as flags, it would be cleaner to use
flags explicitly.
> And a flag IOMMU_SVA_BIND_SUPERVISOR (not that I plan to implement it in
> the SMMU, but I think we need to clean the current usage)
>
You mean move #define SVM_FLAG_SUPERVISOR_MODE out of Intel code to be a
generic flag in iommu-sva-lib.h called IOMMU_SVA_BIND_SUPERVISOR?
I agree if that is the proposal.
> >
> > Let me try to summarize PASID allocation as below:
> >
> > Interfaces | Usage | Limit | bind¹ |User visible
> > --------------------------------------------------------------------
> > /dev/ioasid² | G-SVA/IOVA | cgroup | No
> > |Yes
> > --------------------------------------------------------------------
> > char dev³ | SVA | cgroup | Yes |No
> > --------------------------------------------------------------------
> > iommu driver | default PASID| no | No |No
> >
>
> Is this PASID #0?
>
True for the native case, but not limited to PASID #0 for the guest case. E.g.
for mdev assignment with guest IOVA, the guest PASID would be #0, but the host
aux domain default PASID can be non-zero. Here I meant to include both cases.
> > --------------------------------------------------------------------
> > kernel | super SVA | no | yes |No
> > --------------------------------------------------------------------
>
> Also wondering about device driver allocating auxiliary domains for their
> private use, to do iommu_map/unmap on private PASIDs (a clean replacement
> to super SVA, for example). Would that go through the same path as
> /dev/ioasid and use the cgroup of current task?
>
For the in-kernel private use, I don't think we should restrict based on
cgroup, since there is no affinity to user processes. I also think the
PASID allocation should just use kernel API instead of /dev/ioasid. Why
would user space need to know the actual PASID # for device private domains?
Maybe I missed your idea?
> Thanks,
> Jean
>
> >
> > ¹ Allocated during SVA bind
> > ² PASIDs allocated via /dev/ioasid are not bound to any mm. But its
> > ownership is assigned to the process that does the allocation.
> > ³ Include uacce, other private device driver char dev such as idxd
> >
> > Currently, the proposed /dev/ioasid interface does not map individual
> > PASID with an FD. The FD is at the ioasid_set granularity and bond to
> > the current mm. We could extend the IOCTLs to cover individual PASID-FD
> > passing case when use cases arise. Would this work?
> >
> > Thanks,
> >
> > Jacob
Thanks,
Jacob
On Wed, Mar 24, 2021 at 10:02:46AM -0700, Jacob Pan wrote:
> > Also wondering about device driver allocating auxiliary domains for their
> > private use, to do iommu_map/unmap on private PASIDs (a clean replacement
> > to super SVA, for example). Would that go through the same path as
> > /dev/ioasid and use the cgroup of current task?
>
> For the in-kernel private use, I don't think we should restrict based on
> cgroup, since there is no affinity to user processes. I also think the
> PASID allocation should just use kernel API instead of /dev/ioasid. Why
> would user space need to know the actual PASID # for device private domains?
> Maybe I missed your idea?
There is not much in the kernel that isn't triggered by a process, I
would be careful about the idea that there is a class of users that
can consume a cgroup controlled resource without being inside the
cgroup.
We've got into trouble before overlooking this and with something
greenfield like PASID it would be best built in to the API to prevent
a mistake. eg accepting a cgroup or process input to the allocator.
Jason
Hi Jason,
On Mon, 22 Mar 2021 09:03:00 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Fri, Mar 19, 2021 at 11:22:21AM -0700, Jacob Pan wrote:
> > Hi Jason,
> >
> > On Fri, 19 Mar 2021 10:54:32 -0300, Jason Gunthorpe <[email protected]>
> > wrote:
> > > On Fri, Mar 19, 2021 at 02:41:32PM +0100, Jean-Philippe Brucker
> > > wrote:
> > > > On Fri, Mar 19, 2021 at 09:46:45AM -0300, Jason Gunthorpe wrote:
> > > > > On Fri, Mar 19, 2021 at 10:58:41AM +0100, Jean-Philippe Brucker
> > > > > wrote:
> > > > > > Although there is no use for it at the moment (only two upstream
> > > > > > users and it looks like amdkfd always uses current too), I quite
> > > > > > like the client-server model where the privileged process does
> > > > > > bind() and programs the hardware queue on behalf of the client
> > > > > > process.
> > > > >
> > > > > This creates a lot complexity, how do does process A get a secure
> > > > > reference to B? How does it access the memory in B to setup the
> > > > > HW?
> > > >
> > > > mm_access() for example, and passing addresses via IPC
> > >
> > > I'd rather the source process establish its own PASID and then pass
> > > the rights to use it to some other process via FD passing than try to
> > > go the other way. There are lots of security questions with something
> > > like mm_access.
> > >
> >
> > Thank you all for the input, it sounds like we are OK to remove mm
> > argument from iommu_sva_bind_device() and iommu_sva_alloc_pasid() for
> > now?
> >
> > Let me try to summarize PASID allocation as below:
> >
> > Interfaces | Usage | Limit | bind¹ |User visible
> > /dev/ioasid² | G-SVA/IOVA | cgroup | No
> > |Yes char dev³ | SVA | cgroup |
> > Yes |No iommu driver | default PASID| no
> > | No |No kernel | super SVA | no
> > | yes |No
> >
> > ¹ Allocated during SVA bind
> > ² PASIDs allocated via /dev/ioasid are not bound to any mm. But its
> > ownership is assigned to the process that does the allocation.
>
> What does "not bound to a mm" mean?
>
I meant that the IOASID allocated via /dev/ioasid is in a clean state (just a
number). Its initial state is not bound to an mm, unlike sva_bind_device(),
where the IOASID is allocated at bind time.
The use case is to support guest SVA bind, where allocation and bind are two
separate steps.
> IMHO a user-created PASID is either bound to a mm (current) at creation
> time, or it will never be bound to a mm and its page table is under
> user control via /dev/ioasid.
>
True for a PASID used in native SVA bind. But for binding with a guest mm, the
PASID is allocated first (VT-d virtual command interface, spec 10.4.44), then
bound with the host IOMMU when the vIOMMU PASID cache is invalidated.
Our intention is to have two separate interfaces:
1. /dev/ioasid (allocation/free only)
2. /dev/sva (handles all SVA related activities including page tables)
> I thought the whole point of something like a /dev/ioasid was to get
> away from each and every device creating its own PASID interface?
>
Yes, but only for the use cases that need to expose the PASID to userspace.
AFAICT, the cases are:
1. guest SVA (bind guest mm)
2. full PF/VF assignment (not mediated) where the guest driver wants to
program the actual PASID onto the device.
> It may be somewhat reasonable that some devices could have some easy
> 'make a SVA PASID on current' interface built in,
I agree, this is the case where the PASID is hidden from userspace, right?
E.g. uacce.
> but anything more
> complicated should use /dev/ioasid, and anything consuming PASID
> should also have an API to import and attach a PASID from /dev/ioasid.
>
Would the above two use cases meet the "complicated" criteria? Or should we
say anything that needs the explicit PASID value has to go through
/dev/ioasid?
Could you give some high-level hints on the APIs that hook up IOASIDs
allocated from /dev/ioasid with use cases that combine device and domain
information? Yi is working on the /dev/sva RFC; it would be good to have a
direction check.
> > Currently, the proposed /dev/ioasid interface does not map individual
> > PASID with an FD. The FD is at the ioasid_set granularity and bond to
> > the current mm. We could extend the IOCTLs to cover individual PASID-FD
> > passing case when use cases arise. Would this work?
>
> Is it a good idea that the FD is per ioasid_set ?
We were thinking the allocation IOCTL would be on a per-set basis, so we know
the ownership relationship between PASIDs and their set. If a per-PASID FD is
needed, we can extend it.
> What is the set used
> for?
>
I tried to document the concept in
https://lore.kernel.org/lkml/[email protected]/
In terms of usage for guest SVA, an ioasid_set is mostly tied to a host mm;
the use cases are as follows:
1. Identify a pool of PASIDs for permission checking (belonging to the same
VM), e.g. only allow SVA binding for PASIDs allocated from the same set.
2. Allow different PASID-aware kernel subsystems to associate with it, e.g.
KVM, device drivers, and the IOMMU driver, i.e. each KVM instance only cares
about the ioasid_set associated with its VM. Event notifications are also
scoped to the ioasid_set to synchronize PASID states.
3. Guest-Host PASID lookup (each set has its own XArray to store the
mapping; see the sketch after this list)
4. Quota control (going away once we have cgroup)
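
Regarding point 3, here is a rough sketch of what a per-set lookup helper
could look like. This is purely hypothetical: it assumes a later patch
stores host IOASID data in set->xa indexed by the guest (set-private)
PASID, which is not what the current patch does:

	/* Hypothetical helper inside ioasid.c: guest PASID -> host PASID */
	static ioasid_t guest_to_host_pasid(struct ioasid_set *set,
					    ioasid_t guest_pasid)
	{
		struct ioasid_data *data;

		data = xa_load(&set->xa, guest_pasid);
		if (!data)
			return INVALID_IOASID;

		return data->id;	/* system-wide (host) PASID */
	}
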
> Usually kernel interfaces work nicer with a one fd/one object model.
>
> But even if it is a set, you could pass the set between co-operating
> processes and the PASID can be created in the correct 'current'. But
> there is all kinds of security questsions as soon as you start doing
> anything like this - is there really a use case?
>
We don't see a use case for passing an ioasid_set to another process. All
four use cases above are for the current process.
> Jason
Thanks,
Jacob
Hi Jason,
On Wed, 24 Mar 2021 14:03:38 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Wed, Mar 24, 2021 at 10:02:46AM -0700, Jacob Pan wrote:
> > > Also wondering about device driver allocating auxiliary domains for
> > > their private use, to do iommu_map/unmap on private PASIDs (a clean
> > > replacement to super SVA, for example). Would that go through the
> > > same path as /dev/ioasid and use the cgroup of current task?
> >
> > For the in-kernel private use, I don't think we should restrict based on
> > cgroup, since there is no affinity to user processes. I also think the
> > PASID allocation should just use kernel API instead of /dev/ioasid. Why
> > would user space need to know the actual PASID # for device private
> > domains? Maybe I missed your idea?
>
> There is not much in the kernel that isn't triggered by a process, I
> would be careful about the idea that there is a class of users that
> can consume a cgroup controlled resource without being inside the
> cgroup.
>
> We've got into trouble before overlooking this and with something
> greenfield like PASID it would be best built in to the API to prevent
> a mistake. eg accepting a cgroup or process input to the allocator.
>
Make sense. But I think we only allow charging the current cgroup, how about
I add the following to ioasid_alloc():
	misc_cg = get_current_misc_cg();
	ret = misc_cg_try_charge(MISC_CG_RES_IOASID, misc_cg, 1);
	if (ret) {
		put_misc_cg(misc_cg);
		return ret;
	}
BTW, IOASID will be one of the resources under the proposed misc cgroup.
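
A matching uncharge would be needed on the free path. A rough sketch,
assuming ioasid_data grows a field to remember the charged cgroup (not in
the current code) and that MISC_CG_RES_IOASID is the resource type added
by the proposed misc controller:

	/* Hypothetical: undo the charge taken in ioasid_alloc() */
	static void ioasid_cg_uncharge(struct ioasid_data *data)
	{
		if (!data->misc_cg)
			return;

		misc_cg_uncharge(MISC_CG_RES_IOASID, data->misc_cg, 1);
		put_misc_cg(data->misc_cg);
		data->misc_cg = NULL;
	}
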
Thanks,
Jacob
On Wed, Mar 24, 2021 at 03:12:30PM -0700, Jacob Pan wrote:
> Hi Jason,
>
> On Wed, 24 Mar 2021 14:03:38 -0300, Jason Gunthorpe <[email protected]> wrote:
>
> > On Wed, Mar 24, 2021 at 10:02:46AM -0700, Jacob Pan wrote:
> > > > Also wondering about device driver allocating auxiliary domains for
> > > > their private use, to do iommu_map/unmap on private PASIDs (a clean
> > > > replacement to super SVA, for example). Would that go through the
> > > > same path as /dev/ioasid and use the cgroup of current task?
> > >
> > > For the in-kernel private use, I don't think we should restrict based on
> > > cgroup, since there is no affinity to user processes. I also think the
> > > PASID allocation should just use kernel API instead of /dev/ioasid. Why
> > > would user space need to know the actual PASID # for device private
> > > domains? Maybe I missed your idea?
> >
> > There is not much in the kernel that isn't triggered by a process, I
> > would be careful about the idea that there is a class of users that
> > can consume a cgroup controlled resource without being inside the
> > cgroup.
> >
> > We've got into trouble before overlooking this and with something
> > greenfield like PASID it would be best built in to the API to prevent
> > a mistake. eg accepting a cgroup or process input to the allocator.
> >
> Make sense. But I think we only allow charging the current cgroup, how about
> I add the following to ioasid_alloc():
>
> misc_cg = get_current_misc_cg();
> ret = misc_cg_try_charge(MISC_CG_RES_IOASID, misc_cg, 1);
> if (ret) {
> put_misc_cg(misc_cg);
> return ret;
> }
Does that allow PASID allocation during driver probe, in kernel_init or
modprobe context?
Thanks,
Jean
On Wed, Mar 24, 2021 at 10:02:46AM -0700, Jacob Pan wrote:
> > And a flag IOMMU_SVA_BIND_SUPERVISOR (not that I plan to implement it in
> > the SMMU, but I think we need to clean the current usage)
> >
> You mean move #define SVM_FLAG_SUPERVISOR_MODE out of Intel code to be a
> generic flag in iommu-sva-lib.h called IOMMU_SVA_BIND_SUPERVISOR?
Yes, though it would need to be in iommu.h since it's used by device
drivers
> > Also wondering about device driver allocating auxiliary domains for their
> > private use, to do iommu_map/unmap on private PASIDs (a clean replacement
> > to super SVA, for example). Would that go through the same path as
> > /dev/ioasid and use the cgroup of current task?
> >
> For the in-kernel private use, I don't think we should restrict based on
> cgroup, since there is no affinity to user processes. I also think the
> PASID allocation should just use kernel API instead of /dev/ioasid. Why
> would user space need to know the actual PASID # for device private domains?
> Maybe I missed your idea?
No that's my bad, I didn't get the role of /dev/ioasid. Let me give the
series a proper read.
Thanks,
Jean
Hi Jean-Philippe,
On Thu, 25 Mar 2021 11:21:40 +0100, Jean-Philippe Brucker
<[email protected]> wrote:
> On Wed, Mar 24, 2021 at 03:12:30PM -0700, Jacob Pan wrote:
> > Hi Jason,
> >
> > On Wed, 24 Mar 2021 14:03:38 -0300, Jason Gunthorpe <[email protected]>
> > wrote:
> > > On Wed, Mar 24, 2021 at 10:02:46AM -0700, Jacob Pan wrote:
> > > > > Also wondering about device driver allocating auxiliary domains
> > > > > for their private use, to do iommu_map/unmap on private PASIDs (a
> > > > > clean replacement to super SVA, for example). Would that go
> > > > > through the same path as /dev/ioasid and use the cgroup of
> > > > > current task?
> > > >
> > > > For the in-kernel private use, I don't think we should restrict
> > > > based on cgroup, since there is no affinity to user processes. I
> > > > also think the PASID allocation should just use kernel API instead
> > > > of /dev/ioasid. Why would user space need to know the actual PASID
> > > > # for device private domains? Maybe I missed your idea?
> > >
> > > There is not much in the kernel that isn't triggered by a process, I
> > > would be careful about the idea that there is a class of users that
> > > can consume a cgroup controlled resource without being inside the
> > > cgroup.
> > >
> > > We've got into trouble before overlooking this and with something
> > > greenfield like PASID it would be best built in to the API to prevent
> > > a mistake. eg accepting a cgroup or process input to the allocator.
> > >
> > Make sense. But I think we only allow charging the current cgroup, how
> > about I add the following to ioasid_alloc():
> >
> > misc_cg = get_current_misc_cg();
> > ret = misc_cg_try_charge(MISC_CG_RES_IOASID, misc_cg, 1);
> > if (ret) {
> > put_misc_cg(misc_cg);
> > return ret;
> > }
>
> Does that allow PASID allocation during driver probe, in kernel_init or
> modprobe context?
>
Good point. Yes, you can get cgroup subsystem state in kernel_init for
charging/uncharging. I would think module_init should work also since it is
after kernel_init. I have tried the following:
static int __ref kernel_init(void *unused)
{
	int ret;
+	struct cgroup_subsys_state *css;
+	css = task_get_css(current, pids_cgrp_id);
But that would imply:
1. IOASID has to be built-in, not as module
2. IOASIDs charged on PID1/init would not subject to cgroup limit since it
will be in the root cgroup and we don't support migration nor will migrate.
Then it comes back to the question of why do we try to limit in-kernel
users per cgroup if we can't enforce these cases.
> Thanks,
> Jean
>
Thanks,
Jacob
On Thu, Mar 25, 2021 at 10:02:36AM -0700, Jacob Pan wrote:
> Hi Jean-Philippe,
>
> On Thu, 25 Mar 2021 11:21:40 +0100, Jean-Philippe Brucker
> <[email protected]> wrote:
>
> > On Wed, Mar 24, 2021 at 03:12:30PM -0700, Jacob Pan wrote:
> > > Hi Jason,
> > >
> > > On Wed, 24 Mar 2021 14:03:38 -0300, Jason Gunthorpe <[email protected]>
> > > wrote:
> > > > On Wed, Mar 24, 2021 at 10:02:46AM -0700, Jacob Pan wrote:
> > > > > > Also wondering about device driver allocating auxiliary domains
> > > > > > for their private use, to do iommu_map/unmap on private PASIDs (a
> > > > > > clean replacement to super SVA, for example). Would that go
> > > > > > through the same path as /dev/ioasid and use the cgroup of
> > > > > > current task?
> > > > >
> > > > > For the in-kernel private use, I don't think we should restrict
> > > > > based on cgroup, since there is no affinity to user processes. I
> > > > > also think the PASID allocation should just use kernel API instead
> > > > > of /dev/ioasid. Why would user space need to know the actual PASID
> > > > > # for device private domains? Maybe I missed your idea?
> > > >
> > > > There is not much in the kernel that isn't triggered by a process, I
> > > > would be careful about the idea that there is a class of users that
> > > > can consume a cgroup controlled resource without being inside the
> > > > cgroup.
> > > >
> > > > We've got into trouble before overlooking this and with something
> > > > greenfield like PASID it would be best built in to the API to prevent
> > > > a mistake. eg accepting a cgroup or process input to the allocator.
> > > >
> > > Make sense. But I think we only allow charging the current cgroup, how
> > > about I add the following to ioasid_alloc():
> > >
> > > misc_cg = get_current_misc_cg();
> > > ret = misc_cg_try_charge(MISC_CG_RES_IOASID, misc_cg, 1);
> > > if (ret) {
> > > put_misc_cg(misc_cg);
> > > return ret;
> > > }
> >
> > Does that allow PASID allocation during driver probe, in kernel_init or
> > modprobe context?
> >
> Good point. Yes, you can get cgroup subsystem state in kernel_init for
> charging/uncharging. I would think module_init should work also since it is
> after kernel_init. I have tried the following:
> static int __ref kernel_init(void *unused)
> {
> int ret;
> + struct cgroup_subsys_state *css;
> + css = task_get_css(current, pids_cgrp_id);
>
> But that would imply:
> 1. IOASID has to be built-in, not as module
> 2. IOASIDs charged on PID1/init would not subject to cgroup limit since it
> will be in the root cgroup and we don't support migration nor will migrate.
>
> Then it comes back to the question of why do we try to limit in-kernel
> users per cgroup if we can't enforce these cases.
Are these real use cases? Why would a driver binding to a device
create a single kernel pasid at bind time? Why wouldn't it use
untagged DMA?
When someone needs it they can rework it and explain why they are
doing something sane.
Jason
Hi Jason,
On Thu, 25 Mar 2021 14:16:45 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Thu, Mar 25, 2021 at 10:02:36AM -0700, Jacob Pan wrote:
> > Hi Jean-Philippe,
> >
> > On Thu, 25 Mar 2021 11:21:40 +0100, Jean-Philippe Brucker
> > <[email protected]> wrote:
> >
> > > On Wed, Mar 24, 2021 at 03:12:30PM -0700, Jacob Pan wrote:
> > > > Hi Jason,
> > > >
> > > > On Wed, 24 Mar 2021 14:03:38 -0300, Jason Gunthorpe <[email protected]>
> > > > wrote:
> > > > > On Wed, Mar 24, 2021 at 10:02:46AM -0700, Jacob Pan wrote:
> > > > > > > Also wondering about device driver allocating auxiliary
> > > > > > > domains for their private use, to do iommu_map/unmap on
> > > > > > > private PASIDs (a clean replacement to super SVA, for
> > > > > > > example). Would that go through the same path as /dev/ioasid
> > > > > > > and use the cgroup of current task?
> > > > > >
> > > > > > For the in-kernel private use, I don't think we should restrict
> > > > > > based on cgroup, since there is no affinity to user processes. I
> > > > > > also think the PASID allocation should just use kernel API
> > > > > > instead of /dev/ioasid. Why would user space need to know the
> > > > > > actual PASID # for device private domains? Maybe I missed your
> > > > > > idea?
> > > > >
> > > > > There is not much in the kernel that isn't triggered by a
> > > > > process, I would be careful about the idea that there is a class
> > > > > of users that can consume a cgroup controlled resource without
> > > > > being inside the cgroup.
> > > > >
> > > > > We've got into trouble before overlooking this and with something
> > > > > greenfield like PASID it would be best built in to the API to
> > > > > prevent a mistake. eg accepting a cgroup or process input to the
> > > > > allocator.
> > > > Make sense. But I think we only allow charging the current cgroup,
> > > > how about I add the following to ioasid_alloc():
> > > >
> > > > misc_cg = get_current_misc_cg();
> > > > ret = misc_cg_try_charge(MISC_CG_RES_IOASID, misc_cg, 1);
> > > > if (ret) {
> > > > put_misc_cg(misc_cg);
> > > > return ret;
> > > > }
> > >
> > > Does that allow PASID allocation during driver probe, in kernel_init
> > > or modprobe context?
> > >
> > Good point. Yes, you can get cgroup subsystem state in kernel_init for
> > charging/uncharging. I would think module_init should work also since
> > it is after kernel_init. I have tried the following:
> > static int __ref kernel_init(void *unused)
> > {
> > int ret;
> > + struct cgroup_subsys_state *css;
> > + css = task_get_css(current, pids_cgrp_id);
> >
> > But that would imply:
> > 1. IOASID has to be built-in, not as module
> > 2. IOASIDs charged on PID1/init would not subject to cgroup limit since
> > it will be in the root cgroup and we don't support migration nor will
> > migrate.
> >
> > Then it comes back to the question of why do we try to limit in-kernel
> > users per cgroup if we can't enforce these cases.
>
> Are these real use cases? Why would a driver binding to a device
> create a single kernel pasid at bind time? Why wouldn't it use
> untagged DMA?
>
For VT-d, I don't see such use cases. All PASID allocations by kernel
drivers have a proper process context.
> When someone needs it they can rework it and explain why they are
> doing something sane.
>
Agreed.
> Jason
Thanks,
Jacob
On Thu, Mar 25, 2021 at 02:16:45PM -0300, Jason Gunthorpe wrote:
> On Thu, Mar 25, 2021 at 10:02:36AM -0700, Jacob Pan wrote:
> > Hi Jean-Philippe,
> >
> > On Thu, 25 Mar 2021 11:21:40 +0100, Jean-Philippe Brucker
> > <[email protected]> wrote:
> >
> > > On Wed, Mar 24, 2021 at 03:12:30PM -0700, Jacob Pan wrote:
> > > > Hi Jason,
> > > >
> > > > On Wed, 24 Mar 2021 14:03:38 -0300, Jason Gunthorpe <[email protected]>
> > > > wrote:
> > > > > On Wed, Mar 24, 2021 at 10:02:46AM -0700, Jacob Pan wrote:
> > > > > > > Also wondering about device driver allocating auxiliary domains
> > > > > > > for their private use, to do iommu_map/unmap on private PASIDs (a
> > > > > > > clean replacement to super SVA, for example). Would that go
> > > > > > > through the same path as /dev/ioasid and use the cgroup of
> > > > > > > current task?
> > > > > >
> > > > > > For the in-kernel private use, I don't think we should restrict
> > > > > > based on cgroup, since there is no affinity to user processes. I
> > > > > > also think the PASID allocation should just use kernel API instead
> > > > > > of /dev/ioasid. Why would user space need to know the actual PASID
> > > > > > # for device private domains? Maybe I missed your idea?
> > > > >
> > > > > There is not much in the kernel that isn't triggered by a process, I
> > > > > would be careful about the idea that there is a class of users that
> > > > > can consume a cgroup controlled resource without being inside the
> > > > > cgroup.
> > > > >
> > > > > We've got into trouble before overlooking this and with something
> > > > > greenfield like PASID it would be best built in to the API to prevent
> > > > > a mistake. eg accepting a cgroup or process input to the allocator.
> > > > >
> > > > Make sense. But I think we only allow charging the current cgroup, how
> > > > about I add the following to ioasid_alloc():
> > > >
> > > > misc_cg = get_current_misc_cg();
> > > > ret = misc_cg_try_charge(MISC_CG_RES_IOASID, misc_cg, 1);
> > > > if (ret) {
> > > > put_misc_cg(misc_cg);
> > > > return ret;
> > > > }
> > >
> > > Does that allow PASID allocation during driver probe, in kernel_init or
> > > modprobe context?
> > >
> > Good point. Yes, you can get cgroup subsystem state in kernel_init for
> > charging/uncharging. I would think module_init should work also since it is
> > after kernel_init. I have tried the following:
> > static int __ref kernel_init(void *unused)
> > {
> > int ret;
> > + struct cgroup_subsys_state *css;
> > + css = task_get_css(current, pids_cgrp_id);
> >
> > But that would imply:
> > 1. IOASID has to be built-in, not as module
If IOASID is a module, the device driver will probe once the IOMMU module
is available, which I think always happens in the probe deferral kworker.
> > 2. IOASIDs charged on PID1/init would not subject to cgroup limit since it
> > will be in the root cgroup and we don't support migration nor will migrate.
> >
> > Then it comes back to the question of why do we try to limit in-kernel
> > users per cgroup if we can't enforce these cases.
It may be better to explicitly pass a cgroup during allocation as Jason
suggested. That way anyone using the API will have to be aware of this and
pass the root cgroup if that's what they want.
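To make that explicit, one rough sketch of the allocation entry point (the
extra parameter and its handling are purely illustrative, not part of this
series):

	/* hypothetical: the caller names the cgroup to charge; a NULL cg
	 * would mean "charge nothing", which boot-time callers must opt
	 * into explicitly rather than silently landing in the root cgroup.
	 */
	ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
			      struct misc_cg *cg, void *private);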
> Are these real use cases? Why would a driver binding to a device
> create a single kernel pasid at bind time? Why wouldn't it use
> untagged DMA?
It's not inconceivable to have a control queue doing DMA tagged with
PASID. The devices I know either use untagged DMA, or have a choice to use
a PASID. We're not outright forbidding PASID allocation at boot (I don't
think we can or should) and we won't be able to check every use of the
API, so I'm trying to figure out whether it will always default to root
cgroup, or crash in some corner case.
Thanks,
Jean
On Wed, Mar 24, 2021 at 12:05:28PM -0700, Jacob Pan wrote:
> > IMHO a use created PASID is either bound to a mm (current) at creation
> > time, or it will never be bound to a mm and its page table is under
> > user control via /dev/ioasid.
> >
> True for PASID used in native SVA bind. But for binding with a guest mm,
> PASID is allocated first (VT-d virtual cmd interface Spec 10.4.44), the
> bind with the host IOMMU when vIOMMU PASID cache is invalidated.
>
> Our intention is to have two separate interfaces:
> 1. /dev/ioasid (allocation/free only)
> 2. /dev/sva (handles all SVA related activities including page tables)
I'm not sure I understand why you'd want to have two things. Doesn't
that just complicate everything?
Manipulating the ioasid, including filling it with page tables, seems
an integral, inseparable part of the whole interface. Why have two?
> > I thought the whole point of something like a /dev/ioasid was to get
> > away from each and every device creating its own PASID interface?
> >
> yes, but only for the use cases that need to expose PASID to the
> userspace.
Why "but only"? This thing should reach for a higher generality, not
just be contained to solve some problem within qemu.
> > It maybe somewhat reasonable that some devices could have some easy
> > 'make a SVA PASID on current' interface built in,
> I agree, this is the case PASID is hidden from the userspace, right? e.g.
> uacce.
"hidden", I guess, but does it matter so much?
The PASID would still consume a cgroup credit
> > but anything more
> > complicated should use /dev/ioasid, and anything consuming PASID
> > should also have an API to import and attach a PASID from /dev/ioasid.
> >
> Would the above two use cases constitute the "complicated" criteria? Or we
> should say anything that need the explicit PASID value has to through
> /dev/ioasid?
Anything that needs more than creating a hidden PASID link'd to
current should use the full interface.
> In terms of usage for guest SVA, an ioasid_set is mostly tied to a host mm,
> the use case is as the following:
From that doc:
It is imperative to enforce
VM-IOASID ownership such that a malicious guest cannot target DMA
traffic outside its own IOASIDs, or free an active IOASID that belongs
to another VM.
Huh?
Security in a PASID world comes from the IOMMU blocking access to the
PASID except from approved PCI-ID's. If a VF/PF is assigned to a guest
then that guest can cause the device to issue any PASID by having
complete control and the vIOMMU is supposed to tell the real IOMMU
what PASID's the device is allowed to access.
If a device is sharing a single PCI function with different security
contexts (eg vfio mdev) then the device itself is responsible to
ensure that only the secure interface can program a PASID and a less
secure context can never self-enroll.
Here the mdev driver would have to consult with the vIOMMU to ensure
the mdev device is allowed to access the PASID - is that what this
set stuff is about?
If yes, it is backwards. The MDEV is the thing doing the security, the
MDEV should have the list of allowed PASID's and a single PASID
created under /dev/ioasid should be loaded into MDEV with some 'Ok you
can use PASID xyz from FD abc' command.
Because you absolutely don't want to have a generic 'set' that all the
mdevs are sharing as that violates the basic security principle at the
start - each and every device must have a unique list of what PASID's
it can talk to.
> 1. Identify a pool of PASIDs for permission checking (below to the same VM),
> e.g. only allow SVA binding for PASIDs allocated from the same set.
>
> 2. Allow different PASID-aware kernel subsystems to associate, e.g. KVM,
> device drivers, and IOMMU driver. i.e. each KVM instance only cares about
> the ioasid_set associated with the VM. Events notifications are also within
> the ioasid_set to synchronize PASID states.
>
> 3. Guest-Host PASID look up (each set has its own XArray to store the
> mapping)
>
> 4. Quota control (going away once we have cgroup)
It sounds worrisome that things have gone this way.
I'd say you should have a single /dev/ioasid per VM and KVM should
attach to that - it should get all the global events/etc that are not
device specific.
permission checking *must* be done on a per-device level, either inside the
mdev driver, or inside the IOMMU at a per-PCI device level.
Not sure what guest-host PASID means, these have to be 1:1 for device
assignment to work - why would use something else for mdev?
Jason
Hi Jason,
On Mon, 29 Mar 2021 13:31:47 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Wed, Mar 24, 2021 at 12:05:28PM -0700, Jacob Pan wrote:
>
> > > IMHO a use created PASID is either bound to a mm (current) at creation
> > > time, or it will never be bound to a mm and its page table is under
> > > user control via /dev/ioasid.
> > >
> > True for PASID used in native SVA bind. But for binding with a guest mm,
> > PASID is allocated first (VT-d virtual cmd interface Spec 10.4.44), the
> > bind with the host IOMMU when vIOMMU PASID cache is invalidated.
> >
> > Our intention is to have two separate interfaces:
> > 1. /dev/ioasid (allocation/free only)
> > 2. /dev/sva (handles all SVA related activities including page tables)
>
> I'm not sure I understand why you'd want to have two things. Doesn't
> that just complicate everything?
>
> Manipulating the ioasid, including filling it with page tables, seems
> an integral inseperable part of the whole interface. Why have two ?
>
In one of the earlier discussions, I was made aware of some use cases (by
AMD, iirc) where PASID can be used w/o IOMMU. That is why I tried to keep
ioasid a separate subsystem. Other than that, I don't see an issue
combining the two.
> > > I thought the whole point of something like a /dev/ioasid was to get
> > > away from each and every device creating its own PASID interface?
> > >
> > yes, but only for the use cases that need to expose PASID to the
> > userspace.
>
> Why "but only"? This thing should reach for a higher generality, not
> just be contained to solve some problem within qemu.
>
I totally agree in terms of generality. I was just trying to point out that
existing frameworks or drivers, such as uacce and the idxd driver, do not
need to use /dev/ioasid.
> > > It maybe somewhat reasonable that some devices could have some easy
> > > 'make a SVA PASID on current' interface built in,
> > I agree, this is the case PASID is hidden from the userspace, right?
> > e.g. uacce.
>
> "hidden", I guess, but does it matter so much?
>
It matters when it comes to which interface to choose: use /dev/ioasid to
allocate if the PASID value cannot be hidden; use some other interface that
binds to current and allocates if the PASID is not visible to the user.
> The PASID would still consume a cgroup credit
>
yes, credit still consumed. Just the PASID value is hidden.
> > > but anything more
> > > complicated should use /dev/ioasid, and anything consuming PASID
> > > should also have an API to import and attach a PASID from /dev/ioasid.
> > >
> > Would the above two use cases constitute the "complicated" criteria? Or
> > we should say anything that need the explicit PASID value has to through
> > /dev/ioasid?
>
> Anything that needs more that creating a hidden PASID link'd to
> current should use the full interface.
>
Yes, I think we are on the same page. For example, today's uacce or idxd
driver creates a hidden PASID when the user does open(), where a new WQ is
provisioned and bound to the current mm. This is the case where /dev/ioasid
is not needed.
> > In terms of usage for guest SVA, an ioasid_set is mostly tied to a host
> > mm, the use case is as the following:
>
> From that doc:
>
> It is imperative to enforce
> VM-IOASID ownership such that a malicious guest cannot target DMA
> traffic outside its own IOASIDs, or free an active IOASID that belongs
> to another VM.
>
> Huh?
>
Sorry, I am not following. In the doc, I have an example to show the
ioasid_set to VM/mm mapping. We use the mm as the ioasid_set token to identify
who the owner of an IOASID is, i.e. who allocated the IOASID. A non-owner
cannot perform bind page table or free operations.
Section: IOASID Set Private ID (SPID)
 .------------------.    .------------------.
 |       VM 1       |    |       VM 2       |
 |                  |    |                  |
 |------------------|    |------------------|
 | GPASID/SPID 101  |    | GPASID/SPID 101  |
 '------------------'    '------------------'     Guest
 __________|______________________|______________
           |                      |                Host
           v                      v
 .------------------.    .------------------.
 | Host IOASID 201  |    | Host IOASID 202  |
 '------------------'    '------------------'
 |   IOASID set 1   |    |   IOASID set 2   |
 '------------------'    '------------------'
> Security in a PASID world comes from the IOMMU blocking access to the
> PASID except from approved PCI-ID's. If a VF/PF is assigned to a guest
> then that guest can cause the device to issue any PASID by having
> complete control and the vIOMMU is supposed to tell the real IOMMU
> what PASID's the device is alowed to access.
>
Yes, each PF/VF has its own PASID table. The device can do whatever
it wants as long as the PASID is present in the table. Programming of the
pIOMMU PASID table entry, however, is controlled by the host.
IMHO, there are two levels of security here:
1. A PASID can only be used by a secure context
2. A device can only use allowed PASIDs (PASID namespace is system-wide but
PASID table storage is per PF/VF)
IOASID set is designed for #1.
> If a device is sharing a single PCI function with different security
> contexts (eg vfio mdev) then the device itself is responsible to
> ensure that only the secure interface can program a PASID and a less
> secure context can never self-enroll.
>
If two mdevs from the same PF dev are assigned to two VMs, the PASID
table will be shared. IOASID set ensures one VM cannot program another VM's
PASIDs. I assume 'secure context' is per VM when it comes to host PASID.
> Here the mdev driver would have to consule with the vIOMMU to ensure
> the mdev device is allowed to access the PASID - is that what this
> set stuff is about?
>
No. The mdev driver consults with the IOASID core when the guest programs a
guest PASID onto the mdev. The VDCM driver does a lookup:
	host_pasid = ioasid_find_by_spid(ioasid_set, guest_pasid);
If the guest_pasid does not exist in the ioasid_set, the mdev programming
fails; if the guest_pasid does exist but maps to a wrong host PASID, the
damage is limited to the guest itself.
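As a rough sketch, that check in a VDCM-style mdev driver could look like the
below (struct vdcm_mdev and vdcm_hw_set_pasid() are made-up names; it assumes
ioasid_find_by_spid() from this series returns the host PASID or
INVALID_IOASID):

	static int vdcm_program_guest_pasid(struct vdcm_mdev *vmdev,
					    ioasid_t guest_pasid)
	{
		ioasid_t host_pasid;

		/* only PASIDs allocated from this VM's ioasid_set resolve */
		host_pasid = ioasid_find_by_spid(vmdev->ioasid_set, guest_pasid);
		if (host_pasid == INVALID_IOASID)
			return -EINVAL;	/* not owned by this VM, reject */

		/* program the physical PASID table/WQ entry with host_pasid */
		return vdcm_hw_set_pasid(vmdev, host_pasid);
	}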
> If yes, it is backwards. The MDEV is the thing doing the security, the
> MDEV should have the list of allowed PASID's and a single PASID
> created under /dev/ioasid should be loaded into MDEV with some 'Ok you
> can use PASID xyz from FD abc' command.
>
I guess that is not the case. For a VT-d dedicated WQ, only one PASID can be
programmed onto the device. Programming the PASID with the /dev/sva 'FD abc'
command will be checked against its mm, where /dev/ioasid is used to do the
allocation.
For a single shared WQ assigned to multiple VMs, there will be one mdev per
VM. Again, FD commands are limited to the PASIDs allocated for the VM.
For a single shared WQ assigned to one VM, it can be bound to multiple guest
processes/PASIDs. The host IOMMU driver maintains a list of the PASIDs and
ensures that they are only programmed onto the per-device PASID table.
> Because you absolutely don't want to have a generic 'set' that all the
> mdevs are sharing as that violates the basic security principle at the
> start - each and every device must have a unique list of what PASID's
> it can talk to.
>
I agree, I don't think this is the case. The ioasid_set is somewhat
orthogonal to mdev collections.
> > 1. Identify a pool of PASIDs for permission checking (below to the same
> > VM), e.g. only allow SVA binding for PASIDs allocated from the same set.
> >
> > 2. Allow different PASID-aware kernel subsystems to associate, e.g. KVM,
> > device drivers, and IOMMU driver. i.e. each KVM instance only cares
> > about the ioasid_set associated with the VM. Events notifications are
> > also within the ioasid_set to synchronize PASID states.
> >
> > 3. Guest-Host PASID look up (each set has its own XArray to store the
> > mapping)
> >
> > 4. Quota control (going away once we have cgroup)
>
> It sounds worrysome things have gone this way.
>
Could you expand on that? Guaranteeing quota is very difficult. The cgroup
limit model fits most scalar resources.
> I'd say you shoul have a single /dev/ioasid per VM and KVM should
> attach to that - it should get all the global events/etc that are not
> device specific.
>
You mean a single /dev/ioasid FD per VM and KVM? I think that is what we
are doing in this set. A VM process can only open /dev/ioasid once, then
use the FD for allocation and pass the PASID for bind page table etc.
> permission checking *must* be done on a per-device level, either inside
> the mdev driver, or inside the IOMMU at a per-PCI device level.
>
I think we are on the same page. For mdev, the VDCM driver makes sure the
guest PASID being programmed was allocated by the same VM that also performed
the SVA bind. For a PF/VF which is not mediated, the permission is implied by
the IOMMU driver/HW since the PASID table is per device.
> Not sure what guest-host PASID means, these have to be 1:1 for device
> assignment to work - why would use something else for mdev?
>
We have G-H PASID translation. They don't have to be 1:1.
IOASID Set Private ID (SPID) is intended as a generic solution for guest PASID.
Could you review the section "IOASID Set Private ID (SPID)" in the
doc patch?
We also had some slides from last year. Slides 3-6 mostly.
https://static.sched.com/hosted_files/kvmforum2020/9f/KVM_forum_2020_PASID_MGMT_Yi_Jacob_final.pdf
Really appreciated your time!
Jacob
> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, March 30, 2021 12:32 AM
>
> On Wed, Mar 24, 2021 at 12:05:28PM -0700, Jacob Pan wrote:
>
> > > IMHO a use created PASID is either bound to a mm (current) at creation
> > > time, or it will never be bound to a mm and its page table is under
> > > user control via /dev/ioasid.
> > >
> > True for PASID used in native SVA bind. But for binding with a guest mm,
> > PASID is allocated first (VT-d virtual cmd interface Spec 10.4.44), the
> > bind with the host IOMMU when vIOMMU PASID cache is invalidated.
> >
> > Our intention is to have two separate interfaces:
> > 1. /dev/ioasid (allocation/free only)
> > 2. /dev/sva (handles all SVA related activities including page tables)
>
> I'm not sure I understand why you'd want to have two things. Doesn't
> that just complicate everything?
>
> Manipulating the ioasid, including filling it with page tables, seems
> an integral inseperable part of the whole interface. Why have two ?
Hi, Jason,
Actually, the above is a major open question while we are refactoring the
vSVA uAPI toward this direction. We have two concerns about merging
/dev/ioasid with /dev/sva, and would like to hear your thoughts on whether
they are valid.
First, userspace may use ioasid in a non-SVA scenario where ioasid is
bound to specific security context (e.g. a control vq in vDPA) instead of
tying to mm. In this case there is no pgtable binding initiated from user
space. Instead, ioasid is allocated from /dev/ioasid and then programmed
to the intended security context through specific passthrough framework
which manages that context.
Second, ioasid is managed per process/VM while pgtable binding is a
device-wise operation. The userspace flow looks like below for an integral
/dev/ioasid interface:
-----------initialization----------
- ioctl(container->fd, VFIO_SET_IOMMU, VFIO_TYPE1_NESTING_IOMMU)
- ioasid_fd = open(/dev/ioasid)
- ioctl(ioasid_fd, IOASID_GET_USVA_FD, &sva_fd) //an empty context
- ioctl(device->fd, VFIO_DEVICE_SET_SVA, &sva_fd); //sva_fd ties to device
- ioctl(sva_fd, USVA_GET_INFO, &sva_info);
-----------runtime----------------
- ioctl(ioasid_fd, IOMMU_ALLOC_IOASID, &ioasid);
- ioctl(sva_fd, USVA_BIND_PGTBL, &bind_data);
- ioctl(sva_fd, USVA_FLUSH_CACHE, &inv_info);
- ioctl(sva_fd, USVA_UNBIND_PGTBL, &unbind_data);
-----------destroy----------------
- ioctl(device->fd, VFIO_DEVICE_UNSET_SVA, &sva_fd);
- close(sva_fd)
- close(ioasid_fd)
Our hesitation here is based on one of your earlier comments that
you are not a fan of constructing fd's through ioctl. Are you OK with the
above flow, or do you have a better idea for handling it?
With separate interfaces then userspace just opens /dev/sva instead
of getting it through ioasid_fd:
- ioasid_fd = open(/dev/ioasid)
- sva_fd = open(/dev/sva)
Thanks
Kevin
> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, March 30, 2021 12:32 AM
> > In terms of usage for guest SVA, an ioasid_set is mostly tied to a host mm,
> > the use case is as the following:
>
> From that doc:
>
> It is imperative to enforce
> VM-IOASID ownership such that a malicious guest cannot target DMA
> traffic outside its own IOASIDs, or free an active IOASID that belongs
> to another VM.
>
> Huh?
>
> Security in a PASID world comes from the IOMMU blocking access to the
> PASID except from approved PCI-ID's. If a VF/PF is assigned to a guest
> then that guest can cause the device to issue any PASID by having
> complete control and the vIOMMU is supposed to tell the real IOMMU
> what PASID's the device is alowed to access.
>
> If a device is sharing a single PCI function with different security
> contexts (eg vfio mdev) then the device itself is responsible to
> ensure that only the secure interface can program a PASID and a less
> secure context can never self-enroll.
>
> Here the mdev driver would have to consule with the vIOMMU to ensure
> the mdev device is allowed to access the PASID - is that what this
> set stuff is about?
>
> If yes, it is backwards. The MDEV is the thing doing the security, the
> MDEV should have the list of allowed PASID's and a single PASID
> created under /dev/ioasid should be loaded into MDEV with some 'Ok you
> can use PASID xyz from FD abc' command.
>
The 'set' is per-VM. Once the mdev is assigned to a VM, all valid PASID's
in the set of that VM are considered legitimate on this mdev. The mdev
driver will mediate guest operations which program PASID to the backend
context and load the PASID only if it is within the 'set' (i.e. already
allocated through /dev/ioasid). This prevents a malicious VM from attacking
others. Though it's not the mdev which directly maintains the list of allowed
PASID's, the effect is the same in concept.
Thanks
Kevin
> From: Tian, Kevin
> Sent: Tuesday, March 30, 2021 10:24 AM
>
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Tuesday, March 30, 2021 12:32 AM
> > > In terms of usage for guest SVA, an ioasid_set is mostly tied to a host mm,
> > > the use case is as the following:
> >
> > From that doc:
> >
> > It is imperative to enforce
> > VM-IOASID ownership such that a malicious guest cannot target DMA
> > traffic outside its own IOASIDs, or free an active IOASID that belongs
> > to another VM.
> >
> > Huh?
> >
> > Security in a PASID world comes from the IOMMU blocking access to the
> > PASID except from approved PCI-ID's. If a VF/PF is assigned to a guest
> > then that guest can cause the device to issue any PASID by having
> > complete control and the vIOMMU is supposed to tell the real IOMMU
> > what PASID's the device is alowed to access.
> >
> > If a device is sharing a single PCI function with different security
> > contexts (eg vfio mdev) then the device itself is responsible to
> > ensure that only the secure interface can program a PASID and a less
> > secure context can never self-enroll.
> >
> > Here the mdev driver would have to consule with the vIOMMU to ensure
> > the mdev device is allowed to access the PASID - is that what this
> > set stuff is about?
> >
> > If yes, it is backwards. The MDEV is the thing doing the security, the
> > MDEV should have the list of allowed PASID's and a single PASID
> > created under /dev/ioasid should be loaded into MDEV with some 'Ok you
> > can use PASID xyz from FD abc' command.
> >
>
> The 'set' is per-VM. Once the mdev is assigned to a VM, all valid PASID's
> in the set of that VM are considered legitimate on this mdev. The mdev
> driver will mediate guest operations which program PASID to the backend
> context and load the PASID only if it is within the 'set' (i.e. already
> allocated through /dev/ioasid). This prevents a malicious VM from attacking
> others. Though it's not mdev which directly maintaining the list of allowed
> PASID's, the effect is the same in concept.
>
One correction. The mdev should still construct the list of allowed PASID's as
you said (by listening to IOASID_BIND/UNBIND event), in addition to the ioasid
set maintained per VM (updated when a PASID is allocated/freed). The per-VM
set is required for inter-VM isolation (verified when a pgtable is bound to the
mdev/PASID), while the mdev's own list is necessary for intra-VM isolation when
multiple mdevs are assigned to the same VM (verified before loading a PASID
to the mdev). This series just handles the general part, i.e. the per-VM
ioasid set, and leaves the mdev's own list to be managed by the specific
mdev driver which listens to the various IOASID events.
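Expressed as code, the two checks could look roughly like this
(ioasid_set_owns() and mdev_pasid_allowed() are illustrative helpers, not
from this series):

	/* inter-VM isolation: on the pgtable bind path, check the PASID
	 * against the per-VM ioasid set */
	if (!ioasid_set_owns(vm->ioasid_set, pasid))
		return -EPERM;

	/* intra-VM isolation: before loading the PASID into a particular
	 * mdev, check the list that mdev built from IOASID_BIND/UNBIND */
	if (!mdev_pasid_allowed(mdev, pasid))
		return -EPERM;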
Thanks
Kevin
On Fri, Mar 26, 2021 at 09:06:42AM +0100, Jean-Philippe Brucker wrote:
> It's not inconceivable to have a control queue doing DMA tagged with
> PASID. The devices I know either use untagged DMA, or have a choice to use
> a PASID.
I don't think we should encourage that. A PASID and all that comes with it is
so expensive compared to just doing normal untagged kernel DMA.
I assume HW has these features because virtualization use cases might
use them, eg by using mdev to assign a command queue - then it would
need to be contained by a PASID.
Jason
On Tue, Mar 30, 2021 at 02:24:09AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Tuesday, March 30, 2021 12:32 AM
> > > In terms of usage for guest SVA, an ioasid_set is mostly tied to a host mm,
> > > the use case is as the following:
> >
> > From that doc:
> >
> > It is imperative to enforce
> > VM-IOASID ownership such that a malicious guest cannot target DMA
> > traffic outside its own IOASIDs, or free an active IOASID that belongs
> > to another VM.
> >
> > Huh?
> >
> > Security in a PASID world comes from the IOMMU blocking access to the
> > PASID except from approved PCI-ID's. If a VF/PF is assigned to a guest
> > then that guest can cause the device to issue any PASID by having
> > complete control and the vIOMMU is supposed to tell the real IOMMU
> > what PASID's the device is alowed to access.
> >
> > If a device is sharing a single PCI function with different security
> > contexts (eg vfio mdev) then the device itself is responsible to
> > ensure that only the secure interface can program a PASID and a less
> > secure context can never self-enroll.
> >
> > Here the mdev driver would have to consule with the vIOMMU to ensure
> > the mdev device is allowed to access the PASID - is that what this
> > set stuff is about?
> >
> > If yes, it is backwards. The MDEV is the thing doing the security, the
> > MDEV should have the list of allowed PASID's and a single PASID
> > created under /dev/ioasid should be loaded into MDEV with some 'Ok you
> > can use PASID xyz from FD abc' command.
> >
>
> The 'set' is per-VM. Once the mdev is assigned to a VM, all valid PASID's
> in the set of that VM are considered legitimate on this mdev.
No! That is a major security problem!
PASID authorization is *PER DEVICE*.
If I map a device into VFIO in userspace with full control over the HW
that device MUST ONLY have access to PASID's that have been registered
with vfio.
This means each time you register a PASID vfio must tell the IOMMU
driver to authorize the pci_device to access the PASID, the vIOMMU
driver must tell the hypervisor and the mdev under the PCI device MUST
have a per-device list of allowed PASIDs.
Otherwise userspace in a VM with vfio could tell the mdev driver to
talk to a PASID in the same VM but *that process doesn't own*. This is
absolutely not allowed.
Most likely the entire ioasid set and related need to be deleted as a
kernel concept.
Jason
On Tue, Mar 30, 2021 at 01:37:05AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Tuesday, March 30, 2021 12:32 AM
> >
> > On Wed, Mar 24, 2021 at 12:05:28PM -0700, Jacob Pan wrote:
> >
> > > > IMHO a use created PASID is either bound to a mm (current) at creation
> > > > time, or it will never be bound to a mm and its page table is under
> > > > user control via /dev/ioasid.
> > > >
> > > True for PASID used in native SVA bind. But for binding with a guest mm,
> > > PASID is allocated first (VT-d virtual cmd interface Spec 10.4.44), the
> > > bind with the host IOMMU when vIOMMU PASID cache is invalidated.
> > >
> > > Our intention is to have two separate interfaces:
> > > 1. /dev/ioasid (allocation/free only)
> > > 2. /dev/sva (handles all SVA related activities including page tables)
> >
> > I'm not sure I understand why you'd want to have two things. Doesn't
> > that just complicate everything?
> >
> > Manipulating the ioasid, including filling it with page tables, seems
> > an integral inseperable part of the whole interface. Why have two ?
>
> Hi, Jason,
>
> Actually above is a major open while we are refactoring vSVA uAPI toward
> this direction. We have two concerns about merging /dev/ioasid with
> /dev/sva, and would like to hear your thought whether they are valid.
>
> First, userspace may use ioasid in a non-SVA scenario where ioasid is
> bound to specific security context (e.g. a control vq in vDPA) instead of
> tying to mm. In this case there is no pgtable binding initiated from user
> space. Instead, ioasid is allocated from /dev/ioasid and then programmed
> to the intended security context through specific passthrough framework
> which manages that context.
This sounds like the exact opposite of what I'd like to see.
I do not want to see every subsystem gaining APIs to program a
PASID. All of that should be consolidated in *one place*.
I do not want to see VDPA and VFIO have two nearly identical sets of
APIs to control the PASID.
Drivers consuming a PASID, like VDPA, should consume the PASID and do
nothing more than authorize the HW to use it.
qemu should have general code under the viommu driver that drives
/dev/ioasid to create PASID's and manage the IO mapping according to
the guest's needs.
Drivers like VDPA and VFIO should simply accept that PASID and
configure/authorize their HW to do DMA's with its tag.
> Second, ioasid is managed per process/VM while pgtable binding is a
> device-wise operation. The userspace flow looks like below for an integral
> /dev/ioasid interface:
>
> - ioctl(container->fd, VFIO_SET_IOMMU, VFIO_TYPE1_NESTING_IOMMU)
> - ioasid_fd = open(/dev/ioasid)
> - ioctl(ioasid_fd, IOASID_GET_USVA_FD, &sva_fd) //an empty context
> - ioctl(device->fd, VFIO_DEVICE_SET_SVA, &sva_fd); //sva_fd ties to device
> - ioctl(sva_fd, USVA_GET_INFO, &sva_info);
> - ioctl(ioasid_fd, IOMMU_ALLOC_IOASID, &ioasid);
> - ioctl(sva_fd, USVA_BIND_PGTBL, &bind_data);
> - ioctl(sva_fd, USVA_FLUSH_CACHE, &inv_info);
> - ioctl(sva_fd, USVA_UNBIND_PGTBL, &unbind_data);
> - ioctl(device->fd, VFIO_DEVICE_UNSET_SVA, &sva_fd);
> - close(sva_fd)
> - close(ioasid_fd)
>
> Our hesitation here is based on one of your earlier comments that
> you are not a fan of constructing fd's through ioctl. Are you OK with
> above flow or have a better idea of handling it?
My reaction is to squash 'sva' and ioasid fds together, I can't see
why you'd need two fds to manipulate a PASID.
DEVICE_SET_SVA seems like the wrong language too, it should be more
like DEVICE_ALLOW_IOASID which only tells the iommu and driver to allow
the pci_device to use the IOASID.
Jason
On Tue, Mar 30, 2021 at 04:14:58AM +0000, Tian, Kevin wrote:
> One correction. The mdev should still construct the list of allowed PASID's as
> you said (by listening to IOASID_BIND/UNBIND event), in addition to the ioasid
> set maintained per VM (updated when a PASID is allocated/freed). The per-VM
> set is required for inter-VM isolation (verified when a pgtable is bound to the
> mdev/PASID), while the mdev's own list is necessary for intra-VM isolation when
> multiple mdevs are assigned to the same VM (verified before loading a PASID
> to the mdev). This series just handles the general part i.e. per-VM ioasid set and
> leaves the mdev's own list to be managed by specific mdev driver which listens
> to various IOASID events).
This is better, but I don't understand why we need such a convoluted
design.
Get rid of the ioasid set.
Each driver has its own list of allowed ioasids.
Register a ioasid in the driver's list by passing the fd and ioasid #
No listening to events. A simple understandable security model.
Look - it took you three emails to even correctly explain the security
model you are striving for here; it is *obviously* too complicated for
anyone to understand or successfully implement. Simplify, simplify,
simplify.
Jason
On Tue, Mar 30, 2021 at 10:07:55AM -0300, Jason Gunthorpe wrote:
> On Fri, Mar 26, 2021 at 09:06:42AM +0100, Jean-Philippe Brucker wrote:
>
> > It's not inconceivable to have a control queue doing DMA tagged with
> > PASID. The devices I know either use untagged DMA, or have a choice to use
> > a PASID.
>
> I don't think we should encourage that. A PASID and all the related is
> so expensive compared to just doing normal untagged kernel DMA.
How is it expensive? Low number of PASIDs, or slowing down DMA
transactions? PASIDs aren't a scarce resource on Arm systems, they have
almost 1M unused PASIDs per VM.
Thanks,
Jean
> I assume HW has these features because virtualization use cases might
> use them, eg by using mdev to assign a command queue - then it would
> need be be contained by a PASID.
>
> Jason
On Mon, Mar 29, 2021 at 03:55:26PM -0700, Jacob Pan wrote:
> In one of the earlier discussions, I was made aware of some use cases (by
> AMD, iirc) where PASID can be used w/o IOMMU. That is why I tried to keep
> ioasid a separate subsystem. Other than that, I don't see an issue
> combining the two.
That sounds like nonsense. A freshly created ioasid should have *NO
DMA*. Every access to it should result in a PCI error until a mapping
for the address space is defined. It is called IO *address space* for
a reason.
So, what exactly do you do with a PASID without an IOMMU? You
certainly can't expose it through this interface because you can't
establish the first requirement of *NO DMA*.
While there may be an interesting use case, it looks to be kernel-only
and not relevant here.
> it matters when it comes to which interface to choose. Use /dev/ioasid to
> allocate if PASID value cannot be hidden. Use some other interface for bind
> current and allocate if a PASID is not visible to the user.
I just view it as a shortcut; it has less to do with "hidden" and more
to do with whether the shortcut is a valuable savings. If you swap four
ioctls for one ioctl I'd say that is not enough of a win <shrug>
> Yes, I think we are on the same page. For example, today's uacce or idxd
> driver creates a hidden PASID when user does open(), where a new WQ is
> provisioned and bound to current mm. This is the case where /dev/ioasid is
> not needed.
So that is a problem for uacce, they shouldn't have created PASIDs at
open() time, there is no option to customize what is happening there.
> Sorry, I am not following. In the doc, I have an example to show the
> ioasid_set to VM/mm mapping. We use mm as the ioasid_set token to identify
> who the owner of an IOASID is. i.e. who allocated the IOASID. Non-owner
> cannot perform bind page table or free operations.
As I said to Kevin this seems very over complicated.
Access to the /dev/ioasid FD is the only authorization the kernel
needs.
> Yes, each PF/VF has its own PASID table. The device can do whatever
> it wants as long as the PASID is present in the table. Programming of the
> pIOMMU PASID table entry, however, is controlled by the host.
>
> IMHO, there are two levels of security here:
> 1. A PASID can only be used by a secure context
> 2. A device can only use allowed PASIDs (PASID namespace is system-wide but
> PASID table storage is per PF/VF)
>
> IOASID set is designed for #1.
#1 sounds like the mdev case, and as I said to Kevin each and every
mdev needs its own allow'd PASID list. There is no need for an ioasid
set to implement that.
> > If a device is sharing a single PCI function with different security
> > contexts (eg vfio mdev) then the device itself is responsible to
> > ensure that only the secure interface can program a PASID and a less
> > secure context can never self-enroll.
>
> If two mdevs from the same PF dev are assigned to two VMs, the PASID
> table will be shared. IOASID set ensures one VM cannot program another VM's
> PASIDs. I assume 'secure context' is per VM when it comes to host PASID.
No, the mdev device driver must enforce this directly. It is the one
that programs the physical shared HW, it is the one that needs a list
of PASID's it is allowed to program *for each mdev*
ioasid_set doesn't seem to help at all, certainly not as a concept
tied to /dev/ioasid.
> No. the mdev driver consults with IOASID core When the guest programs a
> guest PASID on to he mdev. VDCM driver does a lookup:
> host_pasid = ioasid_find_by_spid(ioasid_set, guest_pasid);
This is the wrong layering. Tell the mdev device directly what it is
allowed to do. Do not pollute the ioasid core with security stuff.
> > I'd say you shoul have a single /dev/ioasid per VM and KVM should
> > attach to that - it should get all the global events/etc that are not
> > device specific.
> >
> You mean a single /dev/ioasid FD per VM and KVM? I think that is what we
> are doing in this set. A VM process can only open /dev/ioasid once, then
> use the FD for allocation and pass the PASID for bind page table etc.
Yes, I think that is reasonable.
Tag all the IOCTL's with the IOASID number.
> > Not sure what guest-host PASID means, these have to be 1:1 for device
> > assignment to work - why would use something else for mdev?
> >
> We have G-H PASID translation. They don't have to be 1:1.
> IOASID Set Private ID (SPID) is intended as a generic solution for guest PASID.
> Could you review the secion Section: IOASID Set Private ID (SPID) in the
> doc patch?
Again this only works for MDEV? How would you do translation for a
real PF/VF?
So when you 'allow' a mdev to access a PASID you want to say:
Allow Guest PASID A, map it to host PASID B on this /dev/ioasid FD
?
That seems like a good helper library to provide for drivers to use,
but it should be a construct entirely contained in the driver.
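A minimal sketch of keeping that mapping entirely inside the driver, e.g. as
an xarray in the mdev's private structure (struct my_mdev and the helper
names are made up):

	/* userspace authorized: "guest PASID gpasid maps to host PASID hpasid" */
	static int mdev_allow_pasid(struct my_mdev *m, u32 gpasid, u32 hpasid)
	{
		return xa_err(xa_store(&m->allowed_pasids, gpasid,
				       xa_mk_value(hpasid), GFP_KERNEL));
	}

	/* translate and enforce on the programming path */
	static int mdev_translate_pasid(struct my_mdev *m, u32 gpasid, u32 *hpasid)
	{
		void *entry = xa_load(&m->allowed_pasids, gpasid);

		if (!entry)
			return -EPERM;	/* guest PASID never authorized */
		*hpasid = xa_to_value(entry);
		return 0;
	}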
> We also had some slides from last year. Slide 3s-6 mostly.
> https://static.sched.com/hosted_files/kvmforum2020/9f/KVM_forum_2020_PASID_MGMT_Yi_Jacob_final.pdf
I think you are trying to put too much into a giant ioasid
core. Responsibility needs to rest in more logical places, it will
simplify everything.
Jason
On Tue, Mar 30, 2021 at 03:42:24PM +0200, Jean-Philippe Brucker wrote:
> On Tue, Mar 30, 2021 at 10:07:55AM -0300, Jason Gunthorpe wrote:
> > On Fri, Mar 26, 2021 at 09:06:42AM +0100, Jean-Philippe Brucker wrote:
> >
> > > It's not inconceivable to have a control queue doing DMA tagged with
> > > PASID. The devices I know either use untagged DMA, or have a choice to use
> > > a PASID.
> >
> > I don't think we should encourage that. A PASID and all the related is
> > so expensive compared to just doing normal untagged kernel DMA.
>
> How is it expensive? Low number of PASIDs, or slowing down DMA
> transactions? PASIDs aren't a scarce resource on Arm systems, they have
> almost 1M unused PASIDs per VM.
There may be lots of PASIDs, but they are not without cost. The page
table behind them costs memory and cache occupancy, and doing the lookups
hurts DMA performance.
Compared to physically addressed kernel DMA (like x86 often sets up),
the runtime overhead from unnecessary PASID use is quite big.
Jason
Hi Jason,
On Tue, 30 Mar 2021 10:43:13 -0300, Jason Gunthorpe <[email protected]> wrote:
> > If two mdevs from the same PF dev are assigned to two VMs, the PASID
> > table will be shared. IOASID set ensures one VM cannot program another
> > VM's PASIDs. I assume 'secure context' is per VM when it comes to host
> > PASID.
>
> No, the mdev device driver must enforce this directly. It is the one
> that programms the physical shared HW, it is the one that needs a list
> of PASID's it is allowed to program *for each mdev*
>
This requires the mdev driver to obtain a list of allowed PASIDs (possibly
during PASID bind time) prior to doing enforcement. IMHO, the PASID
enforcement points are:
1. During WQ configuration (e.g. programming the MSI)
2. During work submission
For the VT-d shared workqueue, there is no way to enforce #2 in the mdev
driver, in that the PASID is obtained from the PASID MSR on the CPU and
submitted w/o driver involvement. The enforcement for #2 is in the KVM PASID
translation table, which is per VM.
For our current VFIO mdev model, bind guest page table does not involve the
mdev driver. So this is a gap we must fill, i.e. include a callback from the
mdev driver?
> ioasid_set doesn't seem to help at all, certainly not as a concept
> tied to /dev/ioasid.
>
Yes, we can take the security role off ioasid_set once we have the per-mdev
list. However, ioasid_set being a per VM/mm entity also bridges
communications among kernel subsystems that don't have a direct call path,
e.g. KVM, VDCM and IOMMU.
> > No. the mdev driver consults with IOASID core When the guest programs a
> > guest PASID on to he mdev. VDCM driver does a lookup:
> > host_pasid = ioasid_find_by_spid(ioasid_set, guest_pasid);
>
> This is the wrong layering. Tell the mdev device directly what it is
> allowed to do. Do not pollute the ioasid core with security stuff.
>
> > > I'd say you shoul have a single /dev/ioasid per VM and KVM should
> > > attach to that - it should get all the global events/etc that are not
> > > device specific.
> > >
> > You mean a single /dev/ioasid FD per VM and KVM? I think that is what we
> > are doing in this set. A VM process can only open /dev/ioasid once, then
> > use the FD for allocation and pass the PASID for bind page table etc.
>
> Yes, I think that is reasonable.
>
> Tag all the IOCTL's with the IOASID number.
>
> > > Not sure what guest-host PASID means, these have to be 1:1 for device
> > > assignment to work - why would use something else for mdev?
> > >
> > We have G-H PASID translation. They don't have to be 1:1.
> > IOASID Set Private ID (SPID) is intended as a generic solution for
> > guest PASID. Could you review the secion Section: IOASID Set Private ID
> > (SPID) in the doc patch?
>
> Again this only works for MDEV? How would you do translation for a
> real PF/VF?
>
Right, we will need some mediation for PF/VF.
> So when you 'allow' a mdev to access a PASID you want to say:
> Allow Guest PASID A, map it to host PASID B on this /dev/ioasid FD
>
> ?
>
The host and guest PASID values, as well as the device info, are available
through iommu_uapi_sva_bind_gpasid(); we just need to feed that info to the
mdev driver.
> That seems like a good helper library to provide for drivers to use,
> but it should be a construct entirely contained in the driver.
why? would it be cleaner if it is in the common code?
Thanks,
Jacob
Hi Jason,
> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, March 30, 2021 9:29 PM
>
> On Tue, Mar 30, 2021 at 01:37:05AM +0000, Tian, Kevin wrote:
[...]
> > Hi, Jason,
> >
> > Actually above is a major open while we are refactoring vSVA uAPI toward
> > this direction. We have two concerns about merging /dev/ioasid with
> > /dev/sva, and would like to hear your thought whether they are valid.
> >
> > First, userspace may use ioasid in a non-SVA scenario where ioasid is
> > bound to specific security context (e.g. a control vq in vDPA) instead of
> > tying to mm. In this case there is no pgtable binding initiated from user
> > space. Instead, ioasid is allocated from /dev/ioasid and then programmed
> > to the intended security context through specific passthrough framework
> > which manages that context.
>
> This sounds like the exact opposite of what I'd like to see.
>
> I do not want to see every subsystem gaining APIs to program a
> PASID. All of that should be consolidated in *one place*.
>
> I do not want to see VDPA and VFIO have two nearly identical sets of
> APIs to control the PASID.
>
> Drivers consuming a PASID, like VDPA, should consume the PASID and do
> nothing more than authorize the HW to use it.
>
> quemu should have general code under the viommu driver that drives
> /dev/ioasid to create PASID's and manage the IO mapping according to
> the guest's needs.
>
> Drivers like VDPA and VFIO should simply accept that PASID and
> configure/authorize their HW to do DMA's with its tag.
>
> > Second, ioasid is managed per process/VM while pgtable binding is a
> > device-wise operation. The userspace flow looks like below for an integral
> > /dev/ioasid interface:
> >
> > - ioctl(container->fd, VFIO_SET_IOMMU, VFIO_TYPE1_NESTING_IOMMU)
> > - ioasid_fd = open(/dev/ioasid)
> > - ioctl(ioasid_fd, IOASID_GET_USVA_FD, &sva_fd) //an empty context
> > - ioctl(device->fd, VFIO_DEVICE_SET_SVA, &sva_fd); //sva_fd ties to
> device
> > - ioctl(sva_fd, USVA_GET_INFO, &sva_info);
> > - ioctl(ioasid_fd, IOMMU_ALLOC_IOASID, &ioasid);
> > - ioctl(sva_fd, USVA_BIND_PGTBL, &bind_data);
> > - ioctl(sva_fd, USVA_FLUSH_CACHE, &inv_info);
> > - ioctl(sva_fd, USVA_UNBIND_PGTBL, &unbind_data);
> > - ioctl(device->fd, VFIO_DEVICE_UNSET_SVA, &sva_fd);
> > - close(sva_fd)
> > - close(ioasid_fd)
> >
> > Our hesitation here is based on one of your earlier comments that
> > you are not a fan of constructing fd's through ioctl. Are you OK with
> > above flow or have a better idea of handling it?
>
> My reaction is to squash 'sva' and ioasid fds together, I can't see
> why you'd need two fds to manipulate a PASID.
The reason is that the /dev/ioasid FD is per-VM, since the ioasid allocated
to the VM should be shareable by all assigned devices for the VM.
But the SVA operations (bind/unbind page table, cache_invalidate) should
be per-device. Squashing the two fds into one then requires a device
tag for each vSVA ioctl, and I'm not sure that is good. To me, it looks
better to have an SVA FD associated with a device FD so that any ioctl
on it will be at the device level. This also benefits ARM and AMD's vSVA
support since they bind the guest PASID table to the host instead of binding
guest page tables to specific PASIDs.
Regards,
Yi Liu
> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, March 30, 2021 9:28 PM
>
> On Tue, Mar 30, 2021 at 04:14:58AM +0000, Tian, Kevin wrote:
>
> > One correction. The mdev should still construct the list of allowed PASID's
> as
> > you said (by listening to IOASID_BIND/UNBIND event), in addition to the
> ioasid
> > set maintained per VM (updated when a PASID is allocated/freed). The
> per-VM
> > set is required for inter-VM isolation (verified when a pgtable is bound to
> the
> > mdev/PASID), while the mdev's own list is necessary for intra-VM isolation
> when
> > multiple mdevs are assigned to the same VM (verified before loading a
> PASID
> > to the mdev). This series just handles the general part i.e. per-VM ioasid
> set and
> > leaves the mdev's own list to be managed by specific mdev driver which
> listens
> > to various IOASID events).
>
> This is better, but I don't understand why we need such a convoluted
> design.
>
> Get rid of the ioasid set.
>
> Each driver has its own list of allowed ioasids.
First, I agree with you that it's necessary to have a per-device allowed
ioasid list. But besides that, I think we still need to ensure that the
ioasid used by a VM was really allocated to this VM. A VM should not use an
ioasid allocated to another VM, right? Actually, this is the major intention
for introducing ioasid_set.
> Register a ioasid in the driver's list by passing the fd and ioasid #
The fd here is a device fd. Am I right? If yes, your idea is that the ioasid
is allocated via /dev/ioasid and associated with the device fd via either a
VFIO or vDPA ioctl, right? Sorry, I may be asking silly questions but I
really need to ensure we are on the same page.
> No listening to events. A simple understandable security model.
For this suggestion, I have a bit of concern that we may have an A-B/B-A
lock ordering issue, since it requires /dev/ioasid (if it supports this) to
call back into VFIO/VDPA to check whether the ioasid has been registered to
the device FD and record it in the per-device list, right? Let's have more
discussion based on the skeleton sent by Kevin.
Regards,
Yi Liu
> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, March 30, 2021 9:43 PM
[..]
> No, the mdev device driver must enforce this directly. It is the one
> that programms the physical shared HW, it is the one that needs a list
> of PASID's it is allowed to program *for each mdev*
>
> ioasid_set doesn't seem to help at all, certainly not as a concept
> tied to /dev/ioasid.
>
As replied in another thread, we introduced ioasid_set with the motivation
of having per-VM ioasid tracking, which is required when user space tries to
bind an ioasid to a device: we should ensure the ioasid it is using was
allocated to it, otherwise we may suffer inter-VM ioasid problems. It may not
necessarily be ioasid_set, but per-VM ioasid tracking is necessary.
Regards,
Yi Liu
On Tue, Mar 30, 2021 at 05:10:41PM -0700, Jacob Pan wrote:
> Hi Jason,
>
> On Tue, 30 Mar 2021 10:43:13 -0300, Jason Gunthorpe <[email protected]> wrote:
>
> > > If two mdevs from the same PF dev are assigned to two VMs, the PASID
> > > table will be shared. IOASID set ensures one VM cannot program another
> > > VM's PASIDs. I assume 'secure context' is per VM when it comes to host
> > > PASID.
> >
> > No, the mdev device driver must enforce this directly. It is the one
> > that programms the physical shared HW, it is the one that needs a list
> > of PASID's it is allowed to program *for each mdev*
> >
> This requires the mdev driver to obtain a list of allowed PASIDs(possibly
> during PASID bind time) prior to do enforcement. IMHO, the PASID enforcement
> points are:
> 1. During WQ configuration (e.g.program MSI)
> 2. During work submission
>
> For VT-d shared workqueue, there is no way to enforce #2 in mdev driver in
> that the PASID is obtained from PASID MSR from the CPU and submitted w/o
> driver involvement.
I assume that the PASID MSR is privileged and only qemu can program
it? Otherwise this seems like a security problem.
If qemu controls it then the idxd userspace driver in qemu must ensure
it is only ever programmed to an authorized PASID.
> The enforcement for #2 is in the KVM PASID translation table, which
> is per VM.
I don't understand why KVM gets involved in PASID??
Doesn't work submission go either to the mdev driver or through the
secure PASID of #1?
> For our current VFIO mdev model, bind guest page table does not involve
> mdev driver. So this is a gap we must fill, i.e. include a callback from
> mdev driver?
No not a callback, tell the mdev driver with a VFIO IOCTL that it is
authorized to use a specific PASID because the vIOMMU was told to
allow it by the guest kernel. Simple and straightforward.
> > ioasid_set doesn't seem to help at all, certainly not as a concept
> > tied to /dev/ioasid.
> >
> Yes, we can take the security role off ioasid_set once we have per mdev
> list. However, ioasid_set being a per VM/mm entity also bridge
> communications among kernel subsystems that don't have direct call path.
> e.g. KVM, VDCM and IOMMU.
Everything should revolve around the /dev/ioasid FD. qemu should pass
it to all places that need to know about PASID's in the VM.
We should try to avoid hidden behind the scenes kernel
interconnections between subsystems.
> > So when you 'allow' a mdev to access a PASID you want to say:
> > Allow Guest PASID A, map it to host PASID B on this /dev/ioasid FD
> >
> Host and guest PASID value, as well as device info are available through
> iommu_uapi_sva_bind_gpasid(), we just need to feed that info to mdev driver.
You need that IOCTL to exist on the *mdev driver*. It is a VFIO ioctl,
not a iommu or ioasid or sva IOCTL.
> > That seems like a good helper library to provide for drivers to use,
> > but it should be a construct entirely contained in the driver.
> why? would it be cleaner if it is in the common code?
No, it is the "mid layer" problematic design.
Having the iommu layer store driver-specific data on behalf of a
driver will just make a mess. Use the natural layering we have and
store driver specific data in the driver structs.
Add a library to help build the datastructure if it necessary.
Jason
On Wed, Mar 31, 2021 at 07:41:40AM +0000, Liu, Yi L wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Tuesday, March 30, 2021 9:28 PM
> >
> > On Tue, Mar 30, 2021 at 04:14:58AM +0000, Tian, Kevin wrote:
> >
> > > One correction. The mdev should still construct the list of allowed
> > > PASIDs as you said (by listening to IOASID_BIND/UNBIND events), in
> > > addition to the ioasid set maintained per VM (updated when a PASID is
> > > allocated/freed). The per-VM set is required for inter-VM isolation
> > > (verified when a pgtable is bound to the mdev/PASID), while the mdev's
> > > own list is necessary for intra-VM isolation when multiple mdevs are
> > > assigned to the same VM (verified before loading a PASID to the mdev).
> > > This series just handles the general part, i.e. the per-VM ioasid set,
> > > and leaves the mdev's own list to be managed by the specific mdev
> > > driver which listens to various IOASID events.
> >
> > This is better, but I don't understand why we need such a convoluted
> > design.
> >
> > Get rid of the ioasid set.
> >
> > Each driver has its own list of allowed ioasids.
>
> First, I agree with you that it's necessary to have a per-device allowed
> ioasid list. But besides that, I think we still need to ensure the ioasid
> used by a VM is really allocated to this VM. A VM should not use an ioasid
> allocated to another VM, right? Actually, this is the major intention for
> introducing ioasid_set.
The /dev/ioasid FD replaces this security check. By becoming FD
centric you don't need additional kernel security objects.
Any process with access to the /dev/ioasid FD is allowed to control
those PASIDs. The separation between VMs falls naturally from the
separation of FDs without creating additional, complicated, security
infrastructure in the kernel.
This is why all APIs must be FD focused, and you need to have a
logical layering of responsibility.
Allocate a /dev/ioasid FD
Allocate PASIDs inside the FD
Assign memory to the PASIDS
Open a device FD, e.g. from VFIO or VDPA
Instruct the device FD to authorize the device to access PASID A in
an ioasid FD
* Prior to being authorized the device will have NO access to any
PASID
* Presenting BOTH the device FD and the ioasid FD to the kernel
is the security check. Any process with both FDs is allowed to
make the connection. This is normal Unix FD centric thinking.
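For illustration, a minimal userspace sketch of that flow could look like the
below. The ioctl names (IOASID_ALLOC, VFIO_DEVICE_ATTACH_IOASID), the attach
struct and open_vfio_device() are hypothetical placeholders for a uAPI that
does not exist yet, and error handling is omitted:

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>

int main(void)
{
        int ioasid_fd = open("/dev/ioasid", O_RDWR);    /* allocate an ioasid FD */

        uint32_t pasid;
        ioctl(ioasid_fd, IOASID_ALLOC, &pasid);         /* allocate a PASID inside the FD */
        /* ... assign memory / page tables to the PASID via ioasid_fd ... */

        int device_fd = open_vfio_device();             /* hypothetical helper for a vfio_device FD */

        /* Presenting BOTH FDs is the security check: any process holding
         * both is allowed to make the connection */
        struct { int ioasid_fd; uint32_t pasid; } attach = { ioasid_fd, pasid };
        ioctl(device_fd, VFIO_DEVICE_ATTACH_IOASID, &attach);
        return 0;
}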
> > Register an ioasid in the driver's list by passing the fd and ioasid #
>
> The fd here is a device fd. Am I right?
It would be the vfio_device FD, for instance, and a VFIO IOCTL.
> If yes, your idea is ioasid is allocated via /dev/ioasid and
> associated with device fd via either VFIO or vDPA ioctl. right?
> sorry I may be asking silly questions but really need to ensure we
> are on the same page.
Yes, this is right
> > No listening to events. A simple understandable security model.
>
> For this suggestion, I have a bit of a concern that we may have an A-B/B-A
> lock ordering issue, since it requires the /dev/ioasid (if it supports
> this) to call back into VFIO/VDPA to check whether the ioasid has been
> registered to the device FD and record it in the per-device list. Right?
> Let's have more discussion based on the skeleton sent by Kevin.
Callbacks would be backwards.
User calls vfio with vfio_device fd and dev/ioasid fd
VFIO extracts some kernel representation of the ioasid from the ioasid
fd using an API
VFIO does some kernel call to IOMMU/IOASID layer that says 'tell the
IOMMU that this PCI device is allowed to use this PASID'
VFIO mdev drivers then record that the PASID is allowed in their own
device-specific structs for later checking during other system calls.
No lock inversions. No callbacks. Why do we need callbacks?? Simplify.
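A rough kernel-side sketch of that straight call chain, where
ioasid_get_from_fd(), iommu_device_allow_pasid() and the allowed_pasids
xarray are hypothetical names standing in for whatever the real API ends up
being:

/* Hypothetical VFIO ioctl handler; no callbacks, just a downward call chain */
static int vfio_device_allow_ioasid(struct vfio_device *vdev,
				    int ioasid_fd, u32 pasid)
{
	struct ioasid_data *data;
	int ret;

	/* extract the kernel representation of the ioasid from the FD */
	data = ioasid_get_from_fd(ioasid_fd, pasid);
	if (IS_ERR(data))
		return PTR_ERR(data);

	/* tell the IOMMU layer this device is allowed to use the PASID */
	ret = iommu_device_allow_pasid(vdev->dev, pasid);
	if (ret) {
		ioasid_put(data);
		return ret;
	}

	/* record the allowed PASID in the driver's own struct for later
	 * checking during other system calls */
	return xa_insert(&vdev->allowed_pasids, pasid, data, GFP_KERNEL);
}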
Jason
On Wed, Mar 31, 2021 at 07:38:36AM +0000, Liu, Yi L wrote:
> The reason is the /dev/ioasid FD is per-VM since the ioasid allocated to
> the VM should be able to be shared by all assigned devices for the VM.
> But the SVA operations (bind/unbind page table, cache_invalidate) should
> be per-device.
It is not *per-device* it is *per-ioasid*
And as /dev/ioasid is an interface for controlling multiple ioasid's
there is no issue to also multiplex the page table manipulation for
multiple ioasids as well.
What you should do next is sketch out in some RFC the exact ioctls
each FD would have and show how the parts I outlined would work and
point out any remaining gaps.
The device FD is something like the vfio_device FD from VFIO, it has
*nothing* to do with PASID beyond having a single ioctl to authorize
the device to use the PASID. All control of the PASID is in
/dev/ioasid.
Jason
Hi Jason,
On Wed, 31 Mar 2021 09:28:05 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Tue, Mar 30, 2021 at 05:10:41PM -0700, Jacob Pan wrote:
> [...]
> [...]
> [...]
> > This requires the mdev driver to obtain a list of allowed
> > PASIDs(possibly during PASID bind time) prior to do enforcement. IMHO,
> > the PASID enforcement points are:
> > 1. During WQ configuration (e.g.program MSI)
> > 2. During work submission
> >
> > For VT-d shared workqueue, there is no way to enforce #2 in mdev driver
> > in that the PASID is obtained from PASID MSR from the CPU and submitted
> > w/o driver involvement.
>
> I assume that the PASID MSR is privileged and only qemu can program
> it? Otherwise this seems like a security problem.
>
yes.
> If qemu controls it then the idxd userspace driver in qemu must ensure
> it is only ever programmed to an authorized PASID.
>
it is ensured for #1.
> > The enforcement for #2 is in the KVM PASID translation table, which
> > is per VM.
>
> I don't understand why KVM gets involved in PASID??
>
Here is an excerpt from the SIOV spec.
https://software.intel.com/content/www/us/en/develop/download/intel-scalable-io-virtualization-technical-specification.html
"3.3 PASID translation
To support PASID isolation for Shared Work Queues used by VMs, the CPU must
provide a way for the PASID to be communicated to the device in the DMWr
transaction. On Intel CPUs, the CPU provides a PASID translation table in
the vCPUs virtual machine control structures. During ENQCMD/ENQCMDS
instruction execution in a VM, the PASID translation table is used by the
CPU to replace the guest PASID in the work descriptor with a host PASID
before the descriptor is sent to the device.3.3 PASID translation"
> Doesn't work submission go either to the mdev driver or through the
> secure PASID of #1?
>
No, once a PASID is bound with IOMMU, KVM, and the mdev, work submission is
all done in HW.
But I don't think this will change for either uAPI design.
> > For our current VFIO mdev model, bind guest page table does not involve
> > mdev driver. So this is a gap we must fill, i.e. include a callback from
> > mdev driver?
>
> No not a callback, tell the mdev driver with a VFIO IOCTL that it is
> authorized to use a specific PASID because the vIOMMU was told to
> allow it by the guest kernel. Simple and straightforward.
>
Make sense.
> > > ioasid_set doesn't seem to help at all, certainly not as a concept
> > > tied to /dev/ioasid.
> > >
> > Yes, we can take the security role off ioasid_set once we have per mdev
> > list. However, ioasid_set being a per VM/mm entity also bridge
> > communications among kernel subsystems that don't have direct call path.
> > e.g. KVM, VDCM and IOMMU.
>
> Everything should revolve around the /dev/ioasid FD. qemu should pass
> it to all places that need to know about PASID's in the VM.
>
I guess we need to extend KVM interface to support PASIDs. Our original
intention was to avoid introducing new interfaces.
> We should try to avoid hidden behind the scenes kernel
> interconnections between subsystems.
>
Can we? What about exceptions? Since all these IOCTLs are coming from
unreliable user space, we must deal with all exceptions.
For example, when the user closes the /dev/ioasid FD before (or without) the
unbind IOCTL for VFIO or KVM, the kernel must do cleanup and coordinate among
subsystems. In this patchset, we have a per-mm (ioasid_set) notifier to
inform mdev and KVM to clean up and drop their refcounts. Do you have any
suggestion on this?
>
> > > So when you 'allow' a mdev to access a PASID you want to say:
> > > Allow Guest PASID A, map it to host PASID B on this /dev/ioasid FD
> > >
>
> > Host and guest PASID value, as well as device info are available through
> > iommu_uapi_sva_bind_gpasid(), we just need to feed that info to mdev
> > driver.
>
> You need that IOCTL to exist on the *mdev driver*. It is a VFIO ioctl,
> not a iommu or ioasid or sva IOCTL.
>
OK. A separate IOCTL and separate step.
> > > That seems like a good helper library to provide for drivers to use,
> > > but it should be a construct entirely contained in the driver.
> > why? would it be cleaner if it is in the common code?
>
> No, it is the "mid layer" problematic design.
>
> Having the iommu layer store driver-specific data on behalf of a
> driver will just make a mess. Use the natural layering we have and
> store driver specific data in the driver structs.
>
> Add a library to help build the datastructure if it necessary.
>
Let me try to paraphrase: you are suggesting common helper code and data
format, but still driver-specific storage of the mapping, correct?
Will try this out, seems cleaner.
> Jason
Thanks,
Jacob
On Wed, Mar 31, 2021 at 09:34:57AM -0700, Jacob Pan wrote:
> "3.3 PASID translation
> To support PASID isolation for Shared Work Queues used by VMs, the CPU must
> provide a way for the PASID to be communicated to the device in the DMWr
> transaction. On Intel CPUs, the CPU provides a PASID translation table in
> the vCPUs virtual machine control structures. During ENQCMD/ENQCMDS
> instruction execution in a VM, the PASID translation table is used by the
> CPU to replace the guest PASID in the work descriptor with a host PASID
> before the descriptor is sent to the device.3.3 PASID translation"
Yikes, a special ENQCMD table in the hypervisor!
Still, pass the /dev/ioasid into a KVM IOCTL and tell it to populate
this table. KVM only adds to the table when userspace presents a
/dev/ioasid FD.
> > Doesn't work submission go either to the mdev driver or through the
> > secure PASID of #1?
>
> No, once a PASID is bound with IOMMU, KVM, and the mdev, work
> submission is all done in HW. But I don't think this will change
> for either uAPI design.
The big note here is "only for things that use ENQCMD" and that is
basically nothing these days.
> > Everything should revolve around the /dev/ioasid FD. qemu should pass
> > it to all places that need to know about PASID's in the VM.
>
> I guess we need to extend KVM interface to support PASIDs. Our original
> intention was to avoid introducing new interfaces.
New features need new interfaces, especially if there is a security
sensitivity! KVM should *not* automatically opt into security
sensitive stuff without being explicitly told what to do.
Here you'd need to authorize *two* things for IDXD:
- The mdev needs to be told it is allowed to use PASID, this tells
the IOMMU driver to connect the pci device under the mdev
- KVM needs to be told to populate a vPASID to the 'ENQCMD'
security table translated to a physical PASID.
If qemu doesn't explicitly enable the ENQCMD security table it should
be *left disabled* by KVM - even if someone else is using PASID in the
same process. And the API should be narrow like this, just for the
ENQCMD table, as who knows what will come down the road, or how it will
work.
Having a PASID wrongly leak out into the VM would be a security
disaster. Be explicit.
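Sketched from qemu's point of view, the two authorizations stay separate and
explicit. Everything here is invented for illustration (the ioctl names
VFIO_DEVICE_ATTACH_IOASID and KVM_SET_ENQCMD_PASID, and the argument structs,
are not existing uAPI):

#include <stdint.h>
#include <sys/ioctl.h>

/* both authorizations are explicit, independent opt-ins */
static void authorize_idxd_pasid(int ioasid_fd, int mdev_device_fd,
				 int kvm_vm_fd, uint32_t guest_pasid,
				 uint32_t host_pasid)
{
	/* 1) authorize the mdev to use the host PASID; this only reaches
	 *    the IOMMU driver underneath the mdev */
	struct { int ioasid_fd; uint32_t pasid; } allow = { ioasid_fd, host_pasid };
	ioctl(mdev_device_fd, VFIO_DEVICE_ATTACH_IOASID, &allow);

	/* 2) separately ask KVM to install the vPASID -> pPASID entry in the
	 *    ENQCMD translation table, again presenting the /dev/ioasid FD;
	 *    if qemu never issues this, the table stays disabled */
	struct { int ioasid_fd; uint32_t vpasid; uint32_t ppasid; } ent =
		{ ioasid_fd, guest_pasid, host_pasid };
	ioctl(kvm_vm_fd, KVM_SET_ENQCMD_PASID, &ent);
}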
> > We should try to avoid hidden behind the scenes kernel
> > interconnections between subsystems.
> >
> Can we? in case of exception. Since all these IOCTLs are coming from the
> unreliable user space, we must deal all exceptions.
>
> For example, when user closes /dev/ioasid FD before (or w/o) unbind IOCTL
> for VFIO, KVM, kernel must do cleanup and coordinate among subsystems.
> In this patchset, we have a per mm(ioasid_set) notifier to inform mdev, KVM
> to clean up and drop its refcount. Do you have any suggestion on this?
The ioasid should be a reference counted object.
When the FD is closed, or the ioasid is "destroyed" it just blocks DMA
and parks the PASID until *all* places release it. Upon a zero
refcount the PASID is recycled for future use.
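A minimal sketch of that lifecycle, assuming a refcounted ioasid_data object;
the fields and helper names below are illustrative only, not the actual
ioasid.c code:

/* illustrative sketch, not the real implementation */
void ioasid_put(struct ioasid_data *data)
{
	/* the PASID number is only recycled once *all* users (IOMMU, KVM,
	 * mdev drivers, ...) have dropped their references */
	if (refcount_dec_and_test(&data->refs))
		ioasid_recycle(data->pasid);	/* hypothetical helper */
}

void ioasid_destroy(struct ioasid_data *data)
{
	/* "destroy" only blocks further DMA and parks the PASID; later
	 * accesses surface as (suppressed) PCIe errors */
	iommu_block_pasid_dma(data->pasid);	/* hypothetical helper */
	ioasid_put(data);			/* drop the owner's reference */
}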
The duration between unmapping the ioasid and releasing all HW access
will have HW see PCIE TLP errors due to the blocked access. If
userspace messes up the order it is fine to cause this. We already had
this discussion when talking about how to deal with process exit in the
simple SVA case.
> Let me try to paraphrase, you are suggesting common helper code and data
> format but still driver specific storage of the mapping, correct?
The driver just needs to hold the datastructure in its memory.
Like an xarray, the driver can have an xarray inside its struct
device, but the xarray library provides all the implementation.
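For example (struct, field and helper names invented for illustration), the
driver owns the storage while the xarray library does the heavy lifting:

struct my_mdev {
	struct device *dev;
	struct xarray allowed_pasids;	/* driver-private, xa_init() at probe */
};

static int my_mdev_allow_pasid(struct my_mdev *m, u32 pasid, void *entry)
{
	return xa_insert(&m->allowed_pasids, pasid, entry, GFP_KERNEL);
}

static bool my_mdev_pasid_allowed(struct my_mdev *m, u32 pasid)
{
	return xa_load(&m->allowed_pasids, pasid) != NULL;
}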
Jason
Hi Jason,
On Wed, 31 Mar 2021 14:31:48 -0300, Jason Gunthorpe <[email protected]> wrote:
> > > We should try to avoid hidden behind the scenes kernel
> > > interconnections between subsystems.
> > >
> > Can we? in case of exception. Since all these IOCTLs are coming from the
> > unreliable user space, we must deal all exceptions.
> >
> > For example, when user closes /dev/ioasid FD before (or w/o) unbind
> > IOCTL for VFIO, KVM, kernel must do cleanup and coordinate among
> > subsystems. In this patchset, we have a per mm(ioasid_set) notifier to
> > inform mdev, KVM to clean up and drop its refcount. Do you have any
> > suggestion on this?
>
> The ioasid should be a reference counted object.
>
yes, this is done in this patchset.
> When the FD is closed, or the ioasid is "destroyed" it just blocks DMA
> and parks the PASID until *all* places release it. Upon a zero
> refcount the PASID is recycled for future use.
>
Just to clarify, you are saying (when FREE happens before proper
teardown) there is no need to proactively notify all users of the IOASID to
drop their reference. Instead, just wait for the other parties to naturally
close and drop their references. Am I understanding you correctly?
I feel having the notifications can add two values:
1. Shorten the duration of errors (as you mentioned below); FD close can
take a long and unpredictable time, e.g. when the FD is shared.
2. Provide teardown ordering among PASID users. i.e. vCPU, IOMMU, mdev.
> The duration between unmapping the ioasid and releasing all HW access
> will have HW see PCIE TLP errors due to the blocked access. If
> userspace messes up the order it is fine to cause this. We already had
> this discussion when talking about how to deal with process exit in the
> simple SVA case.
Yes, we have disabled fault reporting during this period. The slight
differences vs. the simple SVA case is that KVM is also involved and there
might be an ordering requirement to stop vCPU first.
Thanks,
Jacob
On Wed, Mar 31, 2021 at 11:20:30AM -0700, Jacob Pan wrote:
> Hi Jason,
>
> On Wed, 31 Mar 2021 14:31:48 -0300, Jason Gunthorpe <[email protected]> wrote:
>
> > > > We should try to avoid hidden behind the scenes kernel
> > > > interconnections between subsystems.
> > > >
> > > Can we? in case of exception. Since all these IOCTLs are coming from the
> > > unreliable user space, we must deal all exceptions.
> > >
> > > For example, when user closes /dev/ioasid FD before (or w/o) unbind
> > > IOCTL for VFIO, KVM, kernel must do cleanup and coordinate among
> > > subsystems. In this patchset, we have a per mm(ioasid_set) notifier to
> > > inform mdev, KVM to clean up and drop its refcount. Do you have any
> > > suggestion on this?
> >
> > The ioasid should be a reference counted object.
> >
> yes, this is done in this patchset.
>
> > When the FD is closed, or the ioasid is "destroyed" it just blocks DMA
> > and parks the PASID until *all* places release it. Upon a zero
> > refcount the PASID is recycled for future use.
> >
> Just to clarify, you are saying (when FREE happens before proper
> teardown) there is no need to proactively notify all users of the IOASID to
> drop their reference. Instead, just wait for the other parties to naturally
> close and drop their references. Am I understanding you correctly?
Yes. What are receivers going to do when you notify them anyhow? What
will a mdev do? This is how you get into the crazy locking problems.
It is an error for userspace to shutdown like this, recover sensibly
and don't crash the kernel. PCIe error TLPs are expected, suppress
them. That is what we decided on the mmu notifier discussion.
> I feel having the notifications can add two values:
> 1. Shorten the duration of errors (as you mentioned below), FD close can
> take a long and unpredictable time. e.g. FD shared.
Only if userspace exits in some uncontrolled way. In a controlled exit
it can close all the FDs in the right order.
It is OK if userspace does something weird and ends up with disabled
IOASIDs. It shouldn't do that if it cares.
> 2. Provide teardown ordering among PASID users. i.e. vCPU, IOMMU, mdev.
This is a hard ask too, there is no natural ordering here I can see,
obviously we want vcpu, mdev, iommu for qemu but that doesn't seem to
fall out unless we explicitly hard wire it into the kernel.
Doesn't kvm always kill the vCPU first based on the mmu notifier
shooting down all the memory? IIRC this happens before FD close?
> > The duration between unmapping the ioasid and releasing all HW access
> > will have HW see PCIE TLP errors due to the blocked access. If
> > userspace messes up the order it is fine to cause this. We already had
> > this discussion when talking about how to deal with process exit in the
> > simple SVA case.
> Yes, we have disabled fault reporting during this period. The slight
> differences vs. the simple SVA case is that KVM is also involved and there
> might be an ordering requirement to stop vCPU first.
KVM can continue to use the PASIDs, they are parked and DMA is
permanently blocked. When KVM reaches a natural point in its teardown
it can release them.
If you have to stop the vcpu from a iommu notifier you are in the
crazy locking world I mentioned. IMHO don't create exciting locking
problems just to avoid PCI errors in uncontrolled shutdown.
Suppress the errors instead.
Jason
Hi Jason,
On Wed, 31 Mar 2021 15:33:24 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Wed, Mar 31, 2021 at 11:20:30AM -0700, Jacob Pan wrote:
> > Hi Jason,
> >
> > On Wed, 31 Mar 2021 14:31:48 -0300, Jason Gunthorpe <[email protected]>
> > wrote:
> > > > > We should try to avoid hidden behind the scenes kernel
> > > > > interconnections between subsystems.
> > > > >
> [...]
> [...]
> > yes, this is done in this patchset.
> >
> [...]
> > Just to clarify, you are saying (when FREE happens before proper
> > teardown) there is no need to proactively notify all users of the
> > IOASID to drop their reference. Instead, just wait for the other
> > parties to naturally close and drop their references. Am I
> > understanding you correctly?
>
> Yes. What are receivers going to do when you notify them anyhow? What
> will a mdev do? This is how you get into the crazy locking problems.
>
The receivers perform cleanup work similar to a normal unbind: drain/abort
the PASID. Locking is an issue in that the atomic notifier runs under the
IOASID spinlock, so I provided a common ordered workqueue to let mdev drivers
queue cleanup work that cannot be done in atomic context. Not ideal. We also
need to prevent nested notifications for certain cases.
> It is an error for userspace to shutdown like this, recover sensibly
> and don't crash the kernel. PCIe error TLPs are expected, suppress
> them. That is what we decided on the mmu notifier discussion.
>
> > I feel having the notifications can add two values:
> > 1. Shorten the duration of errors (as you mentioned below), FD close can
> > take a long and unpredictable time. e.g. FD shared.
>
> Only if userspace exits in some uncontrolled way. In a controlled exit
> it can close all the FDs in the right order.
>
> It is OK if userspace does something weird and ends up with disabled
> IOASIDs. It shouldn't do that if it cares.
>
Agreed.
> > 2. Provide teardown ordering among PASID users. i.e. vCPU, IOMMU, mdev.
> >
>
> This is a hard ask too, there is no natural ordering here I can see,
> obviously we want vcpu, mdev, iommu for qemu but that doesn't seem to
> fall out unless we explicitly hard wire it into the kernel.
>
The ordering problem as I understood it is that it is difficult for KVM to
rendezvous all vCPUs before updating the PASID translation table. So there
could be an in-flight ENQCMD with the stale PASID after the PASID table
update and refcount drop.
If KVM is the last one to drop the PASID refcount, the PASID could be
immediately reused and start a new life. The in-flight ENQCMD with the
stale PASID could cause problems. The likelihood and window are very small.
If we ensure KVM does the PASID table update before the IOMMU and mdev
driver, the stale PASID in the in-flight ENQCMD would be drained before it
starts a new life.
Perhaps Yi and Kevin can explain this better.
> Doesn't kvm always kill the vCPU first based on the mmu notifier
> shooting down all the memory? IIRC this happens before FD close?
>
I don't know the answer, Kevin & Yi?
> > > The duration between unmapping the ioasid and releasing all HW access
> > > will have HW see PCIE TLP errors due to the blocked access. If
> > > userspace messes up the order it is fine to cause this. We already had
> > > this discussion when talking about how to deal with process exit in the
> > > simple SVA case.
> > Yes, we have disabled fault reporting during this period. The slight
> > differences vs. the simple SVA case is that KVM is also involved and
> > there might be an ordering requirement to stop vCPU first.
>
> KVM can continue to use the PASIDs, they are parked and DMA is
> permanently blocked. When KVM reaches a natural point in its teardown
> it can release them.
>
> If you have to stop the vcpu from a iommu notifier you are in the
> crazy locking world I mentioned. IMHO don't create exciting locking
> problems just to avoid PCI errors in uncontrolled shutdown.
>
> Suppress the errors instead.
>
I agree, this simplifies things a lot. We just need to clarify the in-flight
ENQCMD case.
> Jason
Thanks,
Jacob
Hi Jason,
On Wed, 31 Mar 2021 09:38:01 -0300, Jason Gunthorpe <[email protected]> wrote:
> > > Get rid of the ioasid set.
> > >
> > > Each driver has its own list of allowed ioasids.
> [...]
>
> The /dev/ioasid FD replaces this security check. By becoming FD
> centric you don't need additional kernel security objects.
>
> Any process with access to the /dev/ioasid FD is allowed to control
> those PASIDs. The separation between VMs falls naturally from the
> separation of FDs without creating additional, complicated, security
> infrastructure in the kernel.
>
> This is why all APIs must be FD focused, and you need to have a
> logical layering of responsibility.
>
> Allocate a /dev/ioasid FD
> Allocate PASIDs inside the FD
> Assign memory to the PASIDS
>
> Open a device FD, eg from VFIO or VDP
> Instruct the device FD to authorize the device to access PASID A in
> an ioasid FD
How do we know the user-provided PASID A was allocated by the ioasid FD?
Shouldn't we validate user input by tracking which PASIDs are allocated by
which ioasid FD? This is one reason why we have ioasid_set and its xarray.
> * Prior to being authorized the device will have NO access to any
> PASID
> * Presenting BOTH the device FD and the ioasid FD to the kernel
> is the security check. Any process with both FDs is allowed to
> make the connection. This is normal Unix FD centric thinking.
>
> > > Register a ioasid in the driver's list by passing the fd and ioasid #
> > >
> >
> > The fd here is a device fd. Am I right?
>
> It would be the vfio_device FD, for instance, and a VFIO IOCTL.
>
> > If yes, your idea is ioasid is allocated via /dev/ioasid and
> > associated with device fd via either VFIO or vDPA ioctl. right?
> > sorry I may be asking silly questions but really need to ensure we
> > are talking in the same page.
>
> Yes, this is right
>
> > > No listening to events. A simple understandable security model.
> >
> > For this suggestion, I have a little bit concern if we may have A-B/B-A
> > lock sequence issue since it requires the /dev/ioasid (if it supports)
> > to call back into VFIO/VDPA to check if the ioasid has been registered
> > to device FD and record it in the per-device list. right? Let's have
> > more discussion based on the skeleton sent by Kevin.
>
> Callbacks would be backwards.
>
> User calls vfio with vfio_device fd and dev/ioasid fd
>
> VFIO extracts some kernel representation of the ioasid from the ioasid
> fd using an API
>
This lookup API seems to be asking for a per-ioasid-FD storage array. Today,
the ioasid_set is per mm and contains an xarray. Since each VM/KVM can only
open one ioasid FD, this per-FD array would be equivalent to the per-mm
ioasid_set, right?
> VFIO does some kernel call to IOMMU/IOASID layer that says 'tell the
> IOMMU that this PCI device is allowed to use this PASID'
>
Would it be redundant to what iommu_uapi_sva_bind_gpasid() does? I thought
the idea is to use ioasid FD IOCTL to issue IOMMU uAPI calls. Or we can
skip this step for now and wait for the user to do SVA bind.
> VFIO mdev drivers then record that the PASID is allowed in its own
> device specific struct for later checking during other system calls.
Thanks,
Jacob
On Wed, Mar 31, 2021 at 04:46:21PM -0700, Jacob Pan wrote:
> Hi Jason,
>
> On Wed, 31 Mar 2021 09:38:01 -0300, Jason Gunthorpe <[email protected]> wrote:
>
> > > > Get rid of the ioasid set.
> > > >
> > > > Each driver has its own list of allowed ioasids.
> > [...]
> >
> > The /dev/ioasid FD replaces this security check. By becoming FD
> > centric you don't need additional kernel security objects.
> >
> > Any process with access to the /dev/ioasid FD is allowed to control
> > those PASIDs. The separation between VMs falls naturally from the
> > separation of FDs without creating additional, complicated, security
> > infrastructure in the kernel.
> >
> > This is why all APIs must be FD focused, and you need to have a
> > logical layering of responsibility.
> >
> > Allocate a /dev/ioasid FD
> > Allocate PASIDs inside the FD
> > Assign memory to the PASIDS
> >
> > Open a device FD, eg from VFIO or VDP
> > Instruct the device FD to authorize the device to access PASID A in
> > an ioasid FD
> How do we know user provided PASID A was allocated by the ioasid FD?
You pass in the ioasid FD and use a 'get pasid from fdno' API to
extract the required kernel structure.
> Shouldn't we validate user input by tracking which PASIDs are
> allocated by which ioasid FD?
Yes, but it is integral to the ioasid FD, not something separated.
> > VFIO extracts some kernel representation of the ioasid from the ioasid
> > fd using an API
> >
> This lookup API seems to be asking for per ioasid FD storage array. Today,
> the ioasid_set is per mm and contains a Xarray.
Right, put the xarray per FD. A set per mm is fairly nonsensical, we
don't use the mm as that kind of security key.
> Since each VM, KVM can only open one ioasid FD, this per FD array
> would be equivalent to the per mm ioasid_set, right?
Why only one? Each interaction with the other FDs should include the
PASID/FD pair. There is no restriction to just one.
> > VFIO does some kernel call to IOMMU/IOASID layer that says 'tell the
> > IOMMU that this PCI device is allowed to use this PASID'
>
> Would it be redundant to what iommu_uapi_sva_bind_gpasid() does? I thought
> the idea is to use ioasid FD IOCTL to issue IOMMU uAPI calls. Or we can
> skip this step for now and wait for the user to do SVA bind.
I'm not sure what you are asking.
Possibly some of the IOMMU API will need a bit adjusting to make
things split.
The act of programming the page tables and the act of authorizing a
PCI BDF to use a PASID are distinct things with two different IOCTLs.
iommu_uapi_sva_bind_gpasid() is never called by anything, and its
uAPI is never implemented.
Joerg? Why did you merge dead uapi and dead code?
Jason
> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, March 31, 2021 8:41 PM
>
> On Wed, Mar 31, 2021 at 07:38:36AM +0000, Liu, Yi L wrote:
>
> > The reason is /dev/ioasid FD is per-VM since the ioasid allocated to
> > the VM should be able to be shared by all assigned device for the VM.
> > But the SVA operations (bind/unbind page table, cache_invalidate) should
> > be per-device.
>
> It is not *per-device* it is *per-ioasid*
>
> And as /dev/ioasid is an interface for controlling multiple ioasid's
> there is no issue to also multiplex the page table manipulation for
> multiple ioasids as well.
>
> > What you should do next is sketch out in some RFC the exact ioctls
> each FD would have and show how the parts I outlined would work and
> point out any remaining gaps.
>
> The device FD is something like the vfio_device FD from VFIO, it has
> *nothing* to do with PASID beyond having a single ioctl to authorize
> the device to use the PASID. All control of the PASID is in
> /dev/ioasid.
good to see this reply. Your idea is much clearer to me now. If I'm getting
you correctly, I think the skeleton is something like below:
1) userspace opens /dev/ioasid; an ioasid is allocated along with a
per-ioasid context which can be used to bind a page table and do cache
invalidation, and an ioasid FD is returned to userspace.
2) userspace passes the ioasid FD to VFIO, letting it be associated with a
device FD (like the vfio_device FD).
3) userspace binds a page table on the ioasid FD with the page table info.
4) userspace unbinds the page table on the ioasid FD.
5) userspace de-associates the ioasid FD and the device FD.
Does the above suit your outline?
If yes, I still have the below concerns and wish to hear your opinion.
- the ioasid FD and device association will happen at runtime instead of
just happening in the setup phase.
- how about AMD's and ARM's vSVA support? Their PASID allocation and page
table setup happen within the guest. They only need to bind the guest PASID
table to the host. The above model seems unable to fit them. (Jean, Eric,
Jacob please feel free to correct me)
- these per-ioasid SVA operations are not aligned with the native SVA usage
model. Native SVA bind is per-device.
Regards,
Yi Liu
Hi Jason,
> From: Liu, Yi L <[email protected]>
> Sent: Thursday, April 1, 2021 12:39 PM
>
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Wednesday, March 31, 2021 8:41 PM
> >
> > On Wed, Mar 31, 2021 at 07:38:36AM +0000, Liu, Yi L wrote:
> >
> > > The reason is the /dev/ioasid FD is per-VM since the ioasid allocated to
> > > the VM should be able to be shared by all assigned devices for the VM.
> > > But the SVA operations (bind/unbind page table, cache_invalidate) should
> > > be per-device.
> >
> > It is not *per-device* it is *per-ioasid*
> >
> > And as /dev/ioasid is an interface for controlling multiple ioasid's
> > there is no issue to also multiplex the page table manipulation for
> > multiple ioasids as well.
> >
> > What you should do next is sketch out in some RFC the exact ioctls
> > each FD would have and show how the parts I outlined would work and
> > point out any remaining gaps.
> >
> > The device FD is something like the vfio_device FD from VFIO, it has
> > *nothing* to do with PASID beyond having a single ioctl to authorize
> > the device to use the PASID. All control of the PASID is in
> > /dev/ioasid.
>
> good to see this reply. Your idea is much clearer to me now. If I'm getting
> you correctly. I think the skeleton is something like below:
> 1) userspace opens a /dev/ioasid, meanwhile there will be an ioasid
> allocated and a per-ioasid context which can be used to do bind page
> table and cache invalidate, an ioasid FD returned to userspace.
> 2) userspace passes the ioasid FD to VFIO, let it associated with a device
> FD (like vfio_device FD).
> 3) userspace binds page table on the ioasid FD with the page table info.
> 4) userspace unbinds the page table on the ioasid FD
> 5) userspace de-associates the ioasid FD and device FD
>
> Does above suit your outline?
>
> If yes, I still have below concern and wish to see your opinion.
> - the ioasid FD and device association will happen at runtime instead of
> just happen in the setup phase.
> - how about AMD and ARM's vSVA support? Their PASID allocation and page
>   table happens within guest. They only need to bind the guest PASID table
>   to host.
> Above model seems unable to fit them. (Jean, Eric, Jacob please feel free
> to correct me)
> - this per-ioasid SVA operations is not aligned with the native SVA usage
> model. Native SVA bind is per-device.
After reading your reply in https://lore.kernel.org/linux-iommu/[email protected]/#t
it seems you mean the /dev/ioasid FD is per-VM instead of per-ioasid, so the
above skeleton doesn't suit your idea. I drafted the below skeleton to see if
our minds are the same. But I still believe there is an open question on how
to fit ARM's and AMD's vSVA support into this per-ioasid SVA operation model.
Thoughts?
+-----------------------------+-----------------------------------------------+
| userspace | kernel space |
+-----------------------------+-----------------------------------------------+
| ioasid_fd = | /dev/ioasid does below: |
| open("/dev/ioasid", O_RDWR);| struct ioasid_fd_ctx { |
| | struct list_head ioasid_list; |
| | ... |
| | } ifd_ctx; // ifd_ctx is per ioasid_fd |
+-----------------------------+-----------------------------------------------+
| ioctl(ioasid_fd, | /dev/ioasid does below: |
| ALLOC, &ioasid); | struct ioasid_data { |
| | ioasid_t ioasid; |
| | struct list_head device_list; |
| | struct list_head next; |
| | ... |
| | } id_data; // id_data is per ioasid |
| | |
| | list_add(&id_data.next, |
| | &ifd_ctx.ioasid_list); |
+-----------------------------+-----------------------------------------------+
| ioctl(device_fd, | VFIO does below: |
| DEVICE_ALLOW_IOASID, | 1) get ioasid_fd, check if ioasid_fd is valid |
| ioasid_fd, | 2) check if ioasid is allocated from ioasid_fd|
| ioasid); | 3) register device/domain info to /dev/ioasid |
| | tracked in id_data.device_list |
| | 4) record the ioasid in VFIO's per-device |
| | ioasid list for future security check |
+-----------------------------+-----------------------------------------------+
| ioctl(ioasid_fd, | /dev/ioasid does below: |
| BIND_PGTBL, | 1) find ioasid's id_data |
| pgtbl_data, | 2) loop the id_data.device_list and tell iommu|
| ioasid); | give ioasid access to the devices |
+-----------------------------+-----------------------------------------------+
| ioctl(ioasid_fd, | /dev/ioasid does below: |
| UNBIND_PGTBL, | 1) find ioasid's id_data |
| ioasid); | 2) loop the id_data.device_list and tell iommu|
| | clear ioasid access to the devices |
+-----------------------------+-----------------------------------------------+
| ioctl(device_fd, | VFIO does below: |
| DEVICE_DISALLOW_IOASID,| 1) check if ioasid is associated in VFIO's |
| ioasid_fd, | device ioasid list. |
| ioasid); | 2) unregister device/domain info from |
| | /dev/ioasid, clear in id_data.device_list |
+-----------------------------+-----------------------------------------------+
| ioctl(ioasid_fd, | /dev/ioasid does below: |
| FREE, ioasid); | list_del(&id_data.next); |
+-----------------------------+-----------------------------------------------+
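To make the above flow concrete, here is a hedged userspace walk-through using
the same placeholder ioctl names from the table (none of this is merged uAPI;
device_fd and pgtbl_data are assumed to come from elsewhere, and error
handling is omitted):

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>

static void ioasid_fd_skeleton(int device_fd, void *pgtbl_data)
{
	int ioasid_fd = open("/dev/ioasid", O_RDWR);

	uint32_t ioasid;
	ioctl(ioasid_fd, ALLOC, &ioasid);                   /* allocate an ioasid */

	struct { int ioasid_fd; uint32_t ioasid; } allow = { ioasid_fd, ioasid };
	ioctl(device_fd, DEVICE_ALLOW_IOASID, &allow);      /* associate with the device */

	/* bind/unbind the page table through the ioasid FD; a real uAPI
	 * would pack the arguments into a single struct */
	struct { uint32_t ioasid; void *pgtbl; } bind = { ioasid, pgtbl_data };
	ioctl(ioasid_fd, BIND_PGTBL, &bind);
	ioctl(ioasid_fd, UNBIND_PGTBL, &ioasid);

	/* teardown in reverse order */
	ioctl(device_fd, DEVICE_DISALLOW_IOASID, &allow);
	ioctl(ioasid_fd, FREE, &ioasid);
}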
Regards,
Yi Liu
On Thu, Apr 01, 2021 at 04:38:44AM +0000, Liu, Yi L wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Wednesday, March 31, 2021 8:41 PM
> >
> > On Wed, Mar 31, 2021 at 07:38:36AM +0000, Liu, Yi L wrote:
> >
> > > The reason is /dev/ioasid FD is per-VM since the ioasid allocated to
> > > the VM should be able to be shared by all assigned device for the VM.
> > > But the SVA operations (bind/unbind page table, cache_invalidate) should
> > > be per-device.
> >
> > It is not *per-device* it is *per-ioasid*
> >
> > And as /dev/ioasid is an interface for controlling multiple ioasid's
> > there is no issue to also multiplex the page table manipulation for
> > multiple ioasids as well.
> >
> > What you should do next is sketch out in some RFC the exact ioctls
> > each FD would have and show how the parts I outlined would work and
> > point out any remaining gaps.
> >
> > The device FD is something like the vfio_device FD from VFIO, it has
> > *nothing* to do with PASID beyond having a single ioctl to authorize
> > the device to use the PASID. All control of the PASID is in
> > /dev/ioasid.
>
> good to see this reply. Your idea is much clearer to me now. If I'm getting
> you correctly. I think the skeleton is something like below:
>
> 1) userspace opens a /dev/ioasid, meanwhile there will be an ioasid
> allocated and a per-ioasid context which can be used to do bind page
> table and cache invalidate, an ioasid FD returned to userspace.
> 2) userspace passes the ioasid FD to VFIO, let it associated with a device
> FD (like vfio_device FD).
> 3) userspace binds page table on the ioasid FD with the page table info.
> 4) userspace unbinds the page table on the ioasid FD
> 5) userspace de-associates the ioasid FD and device FD
>
> Does above suit your outline?
Seems so
> If yes, I still have below concern and wish to see your opinion.
> - the ioasid FD and device association will happen at runtime instead of
> just happen in the setup phase.
Of course, this is required for security. The vIOMMU must perform the
device association when the guest requires it. Otherwise a guest
cannot isolate a PASID to a single process/device pair.
I'm worried Intel views the only use of PASID in a guest is with
ENQCMD, but that is not consistent with the industry. We need to see
normal nested PASID support with assigned PCI VFs.
> - how about AMD and ARM's vSVA support? Their PASID allocation and page table
> happens within guest. They only need to bind the guest PASID table to host.
> Above model seems unable to fit them. (Jean, Eric, Jacob please feel free
> to correct me)
No, everything needs the device association step or it is not
secure.
You can give a PASID to a guest and allow it to manipulate its memory
map directly, nested under the guest's CPU page tables.
However the guest cannot authorize a PCI BDF to utilize that PASID
without going through some kind of step in the hypervisor. A Guest
should not be able to authorize a PASID for a BDF it doesn't have
access to - only the hypervisor can enforce this.
This all must also fit into the mdev model where only the
device-specific mdev driver can do the device specific PASID
authorization. A hypercall is essential, or we need to stop pretending
mdev is a good idea.
I'm sure there will be some small differences, and you should clearly
explain the entire uAPI surface so that someone from AMD and ARM can
review it.
> - this per-ioasid SVA operations is not aligned with the native SVA usage
> model. Native SVA bind is per-device.
Seems like that is an error in native SVA.
SVA is a particular mode of the PASID's memory mapping table, it has
nothing to do with a device.
Jason
On Thu, Apr 01, 2021 at 07:04:01AM +0000, Liu, Yi L wrote:
> > - how about AMD and ARM's vSVA support? Their PASID allocation and page
> >   table happens within guest. They only need to bind the guest PASID table
> >   to host.
In this case each VM has its own IOASID space, and the host IOASID
allocator doesn't participate. Plus this only makes sense when assigning a
whole VF to a guest, and VFIO is the tool for this. So I wouldn't shoehorn
those ops into /dev/ioasid, though we do need a transport for invalidate
commands.
> > Above model seems unable to fit them. (Jean, Eric, Jacob please feel free
> > to correct me)
> > - this per-ioasid SVA operations is not aligned with the native SVA usage
> > model. Native SVA bind is per-device.
Bare-metal SVA doesn't need /dev/ioasid either. A program uses a device
handle to either ask whether SVA is enabled, or to enable it explicitly.
With or without /dev/ioasid, that step is required. OpenCL uses the first
method - automatically enable "fine-grain system SVM" if available, and
provide a flag to userspace.
So userspace does not need to know about PASID. It's only one method for
doing SVA (some GPUs are context-switching page tables instead).
> After reading your reply in https://lore.kernel.org/linux-iommu/[email protected]/#t
> So you mean /dev/ioasid FD is per-VM instead of per-ioasid, so above skeleton
> doesn't suit your idea. I draft below skeleton to see if our mind is the
> same. But I still believe there is an open on how to fit ARM and AMD's
> vSVA support in this the per-ioasid SVA operation model. thoughts?
>
> +-----------------------------+-----------------------------------------------+
> | userspace | kernel space |
> +-----------------------------+-----------------------------------------------+
> | ioasid_fd = | /dev/ioasid does below: |
> | open("/dev/ioasid", O_RDWR);| struct ioasid_fd_ctx { |
> | | struct list_head ioasid_list; |
> | | ... |
> | | } ifd_ctx; // ifd_ctx is per ioasid_fd |
> +-----------------------------+-----------------------------------------------+
> | ioctl(ioasid_fd, | /dev/ioasid does below: |
> | ALLOC, &ioasid); | struct ioasid_data { |
> | | ioasid_t ioasid; |
> | | struct list_head device_list; |
> | | struct list_head next; |
> | | ... |
> | | } id_data; // id_data is per ioasid |
> | | |
> | | list_add(&id_data.next, |
> | | &ifd_ctx.ioasid_list); |
> +-----------------------------+-----------------------------------------------+
> | ioctl(device_fd, | VFIO does below: |
> | DEVICE_ALLOW_IOASID, | 1) get ioasid_fd, check if ioasid_fd is valid |
> | ioasid_fd, | 2) check if ioasid is allocated from ioasid_fd|
> | ioasid); | 3) register device/domain info to /dev/ioasid |
> | | tracked in id_data.device_list |
> | | 4) record the ioasid in VFIO's per-device |
> | | ioasid list for future security check |
> +-----------------------------+-----------------------------------------------+
> | ioctl(ioasid_fd, | /dev/ioasid does below: |
> | BIND_PGTBL, | 1) find ioasid's id_data |
> | pgtbl_data, | 2) loop the id_data.device_list and tell iommu|
> | ioasid); | give ioasid access to the devices |
> +-----------------------------+-----------------------------------------------+
> | ioctl(ioasid_fd, | /dev/ioasid does below: |
> | UNBIND_PGTBL, | 1) find ioasid's id_data |
> | ioasid); | 2) loop the id_data.device_list and tell iommu|
> | | clear ioasid access to the devices |
> +-----------------------------+-----------------------------------------------+
> | ioctl(device_fd, | VFIO does below: |
> | DEVICE_DISALLOW_IOASID,| 1) check if ioasid is associated in VFIO's |
> | ioasid_fd, | device ioasid list. |
> | ioasid); | 2) unregister device/domain info from |
> | | /dev/ioasid, clear in id_data.device_list |
> +-----------------------------+-----------------------------------------------+
> | ioctl(ioasid_fd, | /dev/ioasid does below: |
> | FREE, ioasid); | list_del(&id_data.next); |
> +-----------------------------+-----------------------------------------------+
Also wondering about:
* Querying IOMMU nesting capabilities before binding page tables (which
page table formats are supported?). We were planning to have a VFIO cap,
but I'm guessing we need to go back to the sysfs solution?
* Invalidation, probably an ioasid_fd ioctl?
* Page faults, page response. From and to devices, and don't necessarily
have a PASID. But needed by vdpa as well, so that's also going through
/dev/ioasid?
Thanks,
Jean
On Thu, Apr 01, 2021 at 02:08:17PM +0000, Liu, Yi L wrote:
> DMA page faults are delivered to the root complex via a page request message
> and are per-device according to the PCIe spec. The page request handling
> flow is:
>
> 1) iommu driver receives a page request from the device
> 2) iommu driver parses the page request message and gets the RID, PASID,
>    faulted page, requested permissions, etc.
> 3) iommu driver triggers the fault handler registered by the device driver
>    with iommu_report_device_fault()
This seems confused.
The PASID should define how to handle the page fault, not the driver.
I don't remember any device specific actions in ATS, so what is the
driver supposed to do?
> 4) the device driver's fault handler signals an event FD to notify userspace
>    to fetch the information about the page fault. If it's the VM case, inject
>    the page fault into the VM and let the guest resolve it.
If the PASID is set to 'report page fault to userspace' then some
event should come out of /dev/ioasid, or be reported to a linked
eventfd, or whatever.
If the PASID is set to 'SVM' then the fault should be passed to
handle_mm_fault
And so on.
Userspace chooses what happens based on how they configure the PASID
through /dev/ioasid.
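A rough sketch of that dispatch, assuming /dev/ioasid records a per-PASID
fault mode; the fault_mode field, the enum values and the helper functions
here are invented for illustration:

static void ioasid_handle_iopf(struct ioasid_data *data,
			       struct iommu_fault *fault)
{
	switch (data->fault_mode) {
	case IOASID_FAULT_TO_USER:
		/* queue the fault on the ioasid FD / a linked eventfd */
		ioasid_fd_queue_fault(data, fault);
		break;
	case IOASID_FAULT_SVM:
		/* resolve against the bound mm, as in kernel SVA */
		ioasid_handle_mm_fault(data->mm, fault);
		break;
	default:
		/* not configured: fail the request, DMA stays blocked */
		ioasid_fault_respond(data, fault, IOMMU_PAGE_RESP_FAILURE);
		break;
	}
}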
Why would a device driver get involved here?
> Eric has sent below series for the page fault reporting for VM with passthru
> device.
> https://lore.kernel.org/kvm/[email protected]/
It certainly should not be in vfio pci. Everything using a PASID needs
this infrastructure, VDPA, mdev, PCI, CXL, etc.
Jason
On Thu, Apr 01, 2021 at 10:23:55AM -0700, Jacob Pan wrote:
> Hi Jason,
>
> On Wed, 31 Mar 2021 21:37:05 -0300, Jason Gunthorpe <[email protected]> wrote:
>
> > On Wed, Mar 31, 2021 at 04:46:21PM -0700, Jacob Pan wrote:
> > > Hi Jason,
> > >
> > > On Wed, 31 Mar 2021 09:38:01 -0300, Jason Gunthorpe <[email protected]>
> > > wrote:
> > > > > > Get rid of the ioasid set.
> > > > > >
> > > > > > Each driver has its own list of allowed ioasids.
> > > > [...]
> > > >
> > > > The /dev/ioasid FD replaces this security check. By becoming FD
> > > > centric you don't need additional kernel security objects.
> > > >
> > > > Any process with access to the /dev/ioasid FD is allowed to control
> > > > those PASIDs. The separation between VMs falls naturally from the
> > > > separation of FDs without creating additional, complicated, security
> > > > infrastructure in the kernel.
> > > >
> > > > This is why all APIs must be FD focused, and you need to have a
> > > > logical layering of responsibility.
> > > >
> > > > Allocate a /dev/ioasid FD
> > > > Allocate PASIDs inside the FD
> Just to be super clear. Do we allocate an FD for each PASID and return the
> FD to the user? Or return the plain PASID number back to user space?
I would do multiple PASIDs per /dev/ioasid FD because we expect a lot
of PASIDs to be in use and we'd run into FD number limits.
> > > > Assign memory to the PASIDS
> > > >
> > > > Open a device FD, eg from VFIO or VDP
> > > > Instruct the device FD to authorize the device to access PASID A in
> > > > an ioasid FD
> > > How do we know user provided PASID A was allocated by the ioasid FD?
> >
> > You pass in the ioasid FD and use a 'get pasid from fdno' API to
> > extract the required kernel structure.
> >
> Seems you are talking about two FDs:
> - /dev/ioasid FD
No, just this one.
> - per IOASID FD
> This API ioasid = get_pasid_from_fd(dev_ioasid_fd, ioasid_fd);
> dev_ioasid_fd will find the xarray for all the PASIDs allocated under it,
> ioasid_fd will be the index into the xarray to retrieve the actual ioasid.
> Correct?
'ioasid_fd' is just the ioasid number in whatever numberspace the
/dev/ioasid FDs use.
> > Why only one? Each interaction with the other FDs should include the
> > PASID/FD pair. There is no restriction to just one.
> OK, one per subsystem-VM. For example, if a VM has a VFIO and a VDPA
> device, it should have only two /dev/ioasid FDs respectively. Correct?
No, only one.
For something like qemu's use case I mostly expect the vIOMMU driver
will open /dev/ioasid for each vIOMMU instance it creates (basically
only one)
> > The act of programming the page tables and the act of authorizing a
> > PCI BDF to use a PASID are distinct things with two different IOCTLs.
> >
> Why separate?
Because they have different owners and different layers in the
software.
It is not about use case, it is about putting the control points where
they naturally belong.
> For a complex stack like vSVA, I feel we have to reduce moving parts and do
> some divide and conquer.
The uAPI should have all come together with a user and user application.
The uAPI is the hardest and most important part.
Jason
Hi Jason,
On Wed, 31 Mar 2021 21:37:05 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Wed, Mar 31, 2021 at 04:46:21PM -0700, Jacob Pan wrote:
> > Hi Jason,
> >
> > On Wed, 31 Mar 2021 09:38:01 -0300, Jason Gunthorpe <[email protected]>
> > wrote:
> > > > > Get rid of the ioasid set.
> > > > >
> > > > > Each driver has its own list of allowed ioasids.
> > > [...]
> > >
> > > The /dev/ioasid FD replaces this security check. By becoming FD
> > > centric you don't need additional kernel security objects.
> > >
> > > Any process with access to the /dev/ioasid FD is allowed to control
> > > those PASIDs. The separation between VMs falls naturally from the
> > > separation of FDs without creating additional, complicated, security
> > > infrastructure in the kernel.
> > >
> > > This is why all APIs must be FD focused, and you need to have a
> > > logical layering of responsibility.
> > >
> > > Allocate a /dev/ioasid FD
> > > Allocate PASIDs inside the FD
Just to be super clear. Do we allocate an FD for each PASID and return the
FD to the user? Or return the plain PASID number back to user space?
> > > Assign memory to the PASIDS
> > >
> > > Open a device FD, eg from VFIO or VDP
> > > Instruct the device FD to authorize the device to access PASID A in
> > > an ioasid FD
> > How do we know user provided PASID A was allocated by the ioasid FD?
>
> You pass in the ioasid FD and use a 'get pasid from fdno' API to
> extract the required kernel structure.
>
Seems you are talking about two FDs:
- /dev/ioasid FD
- per IOASID FD
This API ioasid = get_pasid_from_fd(dev_ioasid_fd, ioasid_fd);
dev_ioasid_fd will find the xarray for all the PASIDs allocated under it,
ioasid_fd will be the index into the xarray to retrieve the actual ioasid.
Correct?
> > Shouldn't we validate user input by tracking which PASIDs are
> > allocated by which ioasid FD?
>
> Yes, but it is integral to the ioasid FD, not something separated.
>
OK, if we have per IOASID FD in addition to the /dev/ioasid FD we can
validate user input.
> > > VFIO extracts some kernel representation of the ioasid from the ioasid
> > > fd using an API
> > >
> > This lookup API seems to be asking for per ioasid FD storage array.
> > Today, the ioasid_set is per mm and contains a Xarray.
>
> Right, put the xarray per FD. A set per mm is fairly nonsensical, we
> don't use the mm as that kind of security key.
>
Sounds good, one xarray per /dev/ioasid FD.
> > Since each VM, KVM can only open one ioasid FD, this per FD array
> > would be equivalent to the per mm ioasid_set, right?
>
> Why only one? Each interaction with the other FDs should include the
> PASID/FD pair. There is no restriction to just one.
>
OK, one per subsystem-VM. For example, if a VM has a VFIO and a VDPA
device, it should have only two /dev/ioasid FDs respectively. Correct?
> > > VFIO does some kernel call to IOMMU/IOASID layer that says 'tell the
> > > IOMMU that this PCI device is allowed to use this PASID'
> >
> > Would it be redundant to what iommu_uapi_sva_bind_gpasid() does? I
> > thought the idea is to use ioasid FD IOCTL to issue IOMMU uAPI calls.
> > Or we can skip this step for now and wait for the user to do SVA bind.
>
> I'm not sure what you are asking.
>
> Possibly some of the IOMMU API will need a bit adjusting to make
> things split.
>
> The act of programming the page tables and the act of authorizing a
> PCI BDF to use a PASID are distinct things with two different IOCTLs.
>
Why separate? I don't see a use case to just authorize a PASID but not
bind it to a page table. The very act of binding a page table *is* the
authorization.
> iommu_uapi_sva_bind_gpasid() is never called by anything, and its
> uAPI is never implemented.
>
Just a little background here. We have been working on the vSVA stack
since 2017. At the time, VFIO was the de facto interface for an IOMMU-aware
driver framework. These uAPIs were always developed alongside the
accompanying VFIO patches, which served as consumers. By the time these
IOMMU uAPIs were merged after reviews from most vendors, the VFIO patchset
was approaching maturity at around v7. This is when we suddenly saw a new
request to support VDPA, which had attempted VFIO earlier but ultimately
moved away.
For a complex stack like vSVA, I feel we have to reduce moving parts and do
some divide and conquer.
> Joerg? Why did you merge dead uapi and dead code?
>
> Jason
Thanks,
Jacob
> From: Jean-Philippe Brucker <[email protected]>
> Sent: Thursday, April 1, 2021 8:05 PM
[...]
>
> Also wondering about:
>
> * Querying IOMMU nesting capabilities before binding page tables (which
> page table formats are supported?). We were planning to have a VFIO cap,
> but I'm guessing we need to go back to the sysfs solution?
I think it can also be with /dev/ioasid.
>
> * Invalidation, probably an ioasid_fd ioctl?
yeah, if we are doing bind/unbind page table via ioasid_fd, then yes,
invalidation should go this way as well. This is why I worried it may
fail to meet the requirements from you and Eric.
> * Page faults, page response. From and to devices, and don't necessarily
> have a PASID. But needed by vdpa as well, so that's also going through
> /dev/ioasid?
page faults should still be per-device, but the fault event fd may be stored
in /dev/ioasid. page response would be in /dev/ioasid just like invalidation.
Regards,
Yi Liu
>
> Thanks,
> Jean
> From: Jason Gunthorpe <[email protected]>
> Sent: Thursday, April 1, 2021 9:16 PM
>
> On Thu, Apr 01, 2021 at 01:10:48PM +0000, Liu, Yi L wrote:
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Thursday, April 1, 2021 7:47 PM
> > [...]
> > > I'm worried Intel views the only use of PASID in a guest is with
> > > ENQCMD, but that is not consistent with the industry. We need to see
> > > normal nested PASID support with assigned PCI VFs.
> >
> > I'm not quite following here. Intel also allows PASID usage in a guest
> > without ENQCMD, e.g. passthru a PF to the guest and use a PASID on it
> > without ENQCMD.
>
> Then you need all the parts, the hypervisor calls from the vIOMMU, and
> you can't really use a vPASID.
This is a diagram shows the vSVA setup.
.-------------. .---------------------------.
| vIOMMU | | Guest process CR3, FL only|
| | '---------------------------'
.----------------/
| PASID Entry |--- PASID cache flush -
'-------------' |
| | V
| | CR3 in GPA
'-------------'
Guest
------| Shadow |--------------------------|--------
v v v
Host
.-------------. .----------------------.
| pIOMMU | | Bind FL for GVA-GPA |
| | '----------------------'
.----------------/ |
| PASID Entry | V (Nested xlate)
'----------------\.------------------------------.
| | |SL for GPA-HPA, default domain|
| | '------------------------------'
'-------------'
Where:
- FL = First level/stage one page tables
- SL = Second level/stage two page tables
https://lore.kernel.org/linux-iommu/[email protected]/
>
> I'm not sure how Intel intends to resolve all of this.
>
> > > > - this per-ioasid SVA operations is not aligned with the native SVA
> > > >   usage model. Native SVA bind is per-device.
> > >
> > > Seems like that is an error in native SVA.
> > >
> > > SVA is a particular mode of the PASID's memory mapping table, it has
> > > nothing to do with a device.
> >
> > I think it still has a relationship with the device. This is determined
> > by the DMA remapping hierarchy in hardware, e.g. on Intel VT-d, DMA
> > isolation is enforced first at device granularity and then at PASID
> > granularity. SVA makes use of both PASID and device granularity
> > isolation.
>
> When the device driver authorizes a PASID the VT-d stuff should setup
> the isolation parameters for the give pci_device and PASID.
yes, both device and PASID are needed to set up the VT-d stuff.
> Do not leak implementation details like this as uAPI. Authorization
> and memory map are distinct ideas with distinct interfaces. Do not mix
> them.
got you. Let's focus on the uAPI things here and leave implementation details
in RFC patches.
Thanks,
Yi Liu
> Jason
On Thu, Apr 01, 2021 at 02:05:00PM +0200, Jean-Philippe Brucker wrote:
> On Thu, Apr 01, 2021 at 07:04:01AM +0000, Liu, Yi L wrote:
> > > - how about AMD and ARM's vSVA support? Their PASID allocation and page
> > >   table happens within guest. They only need to bind the guest PASID
> > >   table to host.
>
> In this case each VM has its own IOASID space, and the host IOASID
> allocator doesn't participate. Plus this only makes sense when assigning a
> whole VF to a guest, and VFIO is the tool for this. So I wouldn't shoehorn
> those ops into /dev/ioasid, though we do need a transport for invalidate
> commands.
How does security work? Devices still have to be authorized to use the
PASID and this approach seems like it completely excludes mdev/vdpa
from ever using a PASID, and those are the most logical users.
> > > Above model seems unable to fit them. (Jean, Eric, Jacob please feel free
> > > to correct me)
> > > - this per-ioasid SVA operations is not aligned with the native SVA usage
> > > model. Native SVA bind is per-device.
>
> Bare-metal SVA doesn't need /dev/ioasid either.
It depends on what you are doing. /dev/ioasid would provide fine-grained
control over the memory mapping. It is not strict SVA, but I can see
applications where using a GPU with a pre-configured, optimized mapping
could be interesting.
> A program uses a device handle to either ask whether SVA is enabled,
> or to enable it explicitly. With or without /dev/ioasid, that step
> is required. OpenCL uses the first method - automatically enable
> "fine-grain system SVM" if available, and provide a flag to
> userspace.
SVA can be done with an ioasid; we can decide whether it makes sense to have
shortcuts in every driver.
> So userspace does not need to know about PASID. It's only one method for
> doing SVA (some GPUs are context-switching page tables instead).
Sure, there are lots of approaches. Here we are only talking about
PASID enablement. PASID has more options.
> * Page faults, page response. From and to devices, and don't necessarily
> have a PASID. But needed by vdpa as well, so that's also going through
> /dev/ioasid?
Only real PASIDs should use this interface. All the not-PASID stuff
is on its own.
VDPA should accept a PASID from here and configure/authorize the real
HW to attach the PASID to all DMAs connected to the virtio queues.
Jason
On Thu, Apr 01, 2021 at 01:38:46PM +0000, Liu, Yi L wrote:
> > From: Jean-Philippe Brucker <[email protected]>
> > Sent: Thursday, April 1, 2021 8:05 PM
> [...]
> >
> > Also wondering about:
> >
> > * Querying IOMMU nesting capabilities before binding page tables (which
> > page table formats are supported?). We were planning to have a VFIO cap,
> > but I'm guessing we need to go back to the sysfs solution?
>
> I think it can also be with /dev/ioasid.
Sure, anything to do with page table formats and setting page tables
should go through ioasid.
> > * Invalidation, probably an ioasid_fd ioctl?
>
> yeah, if we are doing bind/unbind_pagtable via ioasid_fd, then yes,
> invalidation should go this way as well. This is why I worried it may
> fail to meet the requirement from you and Eric.
Yes, all manipulation of page tables, including removing memory ranges, or
setting memory ranges to trigger a page fault behavior should go
through here.
> > * Page faults, page response. From and to devices, and don't necessarily
> > have a PASID. But needed by vdpa as well, so that's also going through
> > /dev/ioasid?
>
> page faults should still be per-device, but the fault event fd may be stored
> in /dev/ioasid. page response would be in /dev/ioasid just like invalidation.
Here you mean non-SVA page faults that are delegated to userspace to handle?
Why would that be per-device?
Can you show the flow you imagine?
Jason
On Thu, Apr 01, 2021 at 01:10:48PM +0000, Liu, Yi L wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Thursday, April 1, 2021 7:47 PM
> [...]
> > I'm worried Intel views the only use of PASID in a guest is with
> > ENQCMD, but that is not consistent with the industry. We need to see
> > normal nested PASID support with assigned PCI VFs.
>
> I'm not quire flow here. Intel also allows PASID usage in guest without
> ENQCMD. e.g. Passthru a PF to guest, and use PASID on it without ENQCMD.
Then you need all the parts, the hypervisor calls from the vIOMMU, and
you can't really use a vPASID.
I'm not sure how Intel intends to resolve all of this.
> > > - this per-ioasid SVA operations is not aligned with the native SVA usage
> > > model. Native SVA bind is per-device.
> >
> > Seems like that is an error in native SVA.
> >
> > SVA is a particular mode of the PASID's memory mapping table, it has
> > nothing to do with a device.
>
> I think it still has relationship with device. This is determined by the
> DMA remapping hierarchy in hardware. e.g. Intel VT-d, the DMA isolation is
> enforced first in device granularity and then PASID granularity. SVA makes
> usage of both PASID and device granularity isolation.
When the device driver authorizes a PASID the VT-d stuff should setup
the isolation parameters for the give pci_device and PASID.
Do not leak implementation details like this as uAPI. Authorization
and memory map are distinct ideas with distinct interfaces. Do not mix
them.
Jason
On Thu, Apr 01, 2021 at 01:43:36PM +0000, Liu, Yi L wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Thursday, April 1, 2021 9:16 PM
> >
> > On Thu, Apr 01, 2021 at 01:10:48PM +0000, Liu, Yi L wrote:
> > > > From: Jason Gunthorpe <[email protected]>
> > > > Sent: Thursday, April 1, 2021 7:47 PM
> > > [...]
> > > > I'm worried Intel views the only use of PASID in a guest is with
> > > > ENQCMD, but that is not consistent with the industry. We need to see
> > > > normal nested PASID support with assigned PCI VFs.
> > >
> > > I'm not quire flow here. Intel also allows PASID usage in guest without
> > > ENQCMD. e.g. Passthru a PF to guest, and use PASID on it without
> > ENQCMD.
> >
> > Then you need all the parts, the hypervisor calls from the vIOMMU, and
> > you can't really use a vPASID.
>
> This is a diagram shows the vSVA setup.
I'm not talking only about vSVA. Generic PASID support with arbitrary
mappings.
And how do you deal with the vPASID vs pPASID issue if the system has
a mix of physical devices and mdevs?
Jason
On Thu, Apr 01, 2021 at 07:04:01AM +0000, Liu, Yi L wrote:
> After reading your reply in https://lore.kernel.org/linux-iommu/[email protected]/#t
> So you mean /dev/ioasid FD is per-VM instead of per-ioasid, so above skeleton
> doesn't suit your idea.
You can do it one PASID per FD or multiple PASIDs per FD. Most likely
we will have high numbers of PASIDs in a qemu process, so I assume
that the number of FDs will start to be a constraining factor, thus
multiplexing is reasonable.
It doesn't really change anything about the basic flow.
digging deeply into it either seems like a reasonable choice.
> +-----------------------------+-----------------------------------------------+
> | userspace | kernel space |
> +-----------------------------+-----------------------------------------------+
> | ioasid_fd = | /dev/ioasid does below: |
> | open("/dev/ioasid", O_RDWR);| struct ioasid_fd_ctx { |
> | | struct list_head ioasid_list; |
> | | ... |
> | | } ifd_ctx; // ifd_ctx is per ioasid_fd |
Sure, possibly an xarray not a list
> +-----------------------------+-----------------------------------------------+
> | ioctl(ioasid_fd, | /dev/ioasid does below: |
> | ALLOC, &ioasid); | struct ioasid_data { |
> | | ioasid_t ioasid; |
> | | struct list_head device_list; |
> | | struct list_head next; |
> | | ... |
> | | } id_data; // id_data is per ioasid |
> | | |
> | | list_add(&id_data.next, |
> | | &ifd_ctx.ioasid_list);
> |
Yes, this should have a kref in it too
> +-----------------------------+-----------------------------------------------+
> | ioctl(device_fd, | VFIO does below: |
> | DEVICE_ALLOW_IOASID, | 1) get ioasid_fd, check if ioasid_fd is valid |
> | ioasid_fd, | 2) check if ioasid is allocated from ioasid_fd|
> | ioasid); | 3) register device/domain info to /dev/ioasid |
> | | tracked in id_data.device_list |
> | | 4) record the ioasid in VFIO's per-device |
> | | ioasid list for future security check |
You would provide a function that does steps 1 & 2; look at eventfd for an
example.
I'm not sure we need to register the device with the ioasid. The device
should incr the kref on the ioasid_data at this point.
> +-----------------------------+-----------------------------------------------+
> | ioctl(ioasid_fd, | /dev/ioasid does below: |
> | BIND_PGTBL, | 1) find ioasid's id_data |
> | pgtbl_data, | 2) loop the id_data.device_list and tell iommu|
> | ioasid); | give ioasid access to the devices
> |
This seems backwards, DEVICE_ALLOW_IOASID should tell the iommu to
give the ioasid to the device.
Here the ioctl should be about assigning a memory map from the current
mm_struct to the pasid.
> +-----------------------------+-----------------------------------------------+
> | ioctl(ioasid_fd, | /dev/ioasid does below: |
> | UNBIND_PGTBL, | 1) find ioasid's id_data |
> | ioasid); | 2) loop the id_data.device_list and tell iommu|
> | | clear ioasid access to the devices |
Also seems backwards. The ioctl here should be 'destroy ioasid' which
wipes out the page table, halts DMA access and parks the PASID until
all users are done.
> +-----------------------------+-----------------------------------------------+
> | ioctl(device_fd, | VFIO does below: |
> | DEVICE_DISALLOW_IOASID,| 1) check if ioasid is associated in VFIO's |
> | ioasid_fd, | device ioasid list. |
> | ioasid); | 2) unregister device/domain info from |
> | | /dev/ioasid, clear in id_data.device_list |
This should disconnect the iommu and kref_put the ioasid_data
Remember the layering: only the device_fd knows what the pci_device is
that it is touching. It doesn't make a lot of sense to leak that into
the ioasid world, which should only be dealing with the page table
mapping.
> +-----------------------------+-----------------------------------------------+
> | ioctl(ioasid_fd, | /dev/ioasid does below: |
> | FREE, ioasid); | list_del(&id_data.next); |
> +-----------------------------+-----------------------------------------------+
Don't know if we need a free. The sequence above is backwards; the
page table should be set up, the device authorized, the device
de-authorized, then the page table destroyed. The PASID recycles once
everyone is released.
Include a sequence showing how the kvm FD is used to program the
vPASID to pPASID table that ENQCMD uses.
Show how dynamic authorization works based on requests from the
guest's vIOMMU
Jason
> From: Jason Gunthorpe <[email protected]>
> Sent: Thursday, April 1, 2021 7:47 PM
[...]
> I'm worried Intel views the only use of PASID in a guest is with
> ENQCMD, but that is not consistent with the industry. We need to see
> normal nested PASID support with assigned PCI VFs.
I'm not quite following here. Intel also allows PASID usage in a guest
without ENQCMD, e.g. pass through a PF to the guest and use PASID on it
without ENQCMD.
[...]
> I'm sure there will be some small differences, and you should clearly
> explain the entire uAPI surface so that soneone from AMD and ARM can
> review it.
good suggestion, will do.
> > - this per-ioasid SVA operations is not aligned with the native SVA usage
> > model. Native SVA bind is per-device.
>
> Seems like that is an error in native SVA.
>
> SVA is a particular mode of the PASID's memory mapping table, it has
> nothing to do with a device.
I think it still has a relationship with the device. This is determined by
the DMA remapping hierarchy in hardware. e.g. with Intel VT-d, DMA isolation
is enforced first at device granularity and then at PASID granularity. SVA
makes use of both PASID and device granularity isolation.
Regards,
Yi Liu
> Jason
> From: Jason Gunthorpe <[email protected]>
> Sent: Thursday, April 1, 2021 9:43 PM
>
> On Thu, Apr 01, 2021 at 01:38:46PM +0000, Liu, Yi L wrote:
> > > From: Jean-Philippe Brucker <[email protected]>
> > > Sent: Thursday, April 1, 2021 8:05 PM
> > [...]
> > >
> > > Also wondering about:
> > >
> > > * Querying IOMMU nesting capabilities before binding page tables
> (which
> > > page table formats are supported?). We were planning to have a VFIO
> cap,
> > > but I'm guessing we need to go back to the sysfs solution?
> >
> > I think it can also be with /dev/ioasid.
>
> Sure, anything to do with page table formats and setting page tables
> should go through ioasid.
>
> > > * Invalidation, probably an ioasid_fd ioctl?
> >
> > yeah, if we are doing bind/unbind_pagtable via ioasid_fd, then yes,
> > invalidation should go this way as well. This is why I worried it may
> > fail to meet the requirement from you and Eric.
>
> Yes, all manipulation of page tables, including removing memory ranges, or
> setting memory ranges to trigger a page fault behavior should go
> through here.
>
> > > * Page faults, page response. From and to devices, and don't necessarily
> > > have a PASID. But needed by vdpa as well, so that's also going through
> > > /dev/ioasid?
> >
> > page faults should still be per-device, but the fault event fd may be stored
> > in /dev/ioasid. page response would be in /dev/ioasid just like invalidation.
>
> Here you mean non-SVA page faults that are delegated to userspace to
> handle?
No, just SVA page faults; otherwise there is no need to let userspace handle them.
>
> Why would that be per-device?
>
> Can you show the flow you imagine?
DMA page faults are delivered to the root complex via page request messages,
and they are per-device according to the PCIe spec. The page request handling
flow is:
1) iommu driver receives a page request from the device
2) iommu driver parses the page request message and gets the RID, PASID,
   faulting page, requested permissions, etc.
3) iommu driver triggers the fault handler registered by the device driver
   with iommu_report_device_fault()
4) device driver's fault handler signals an event FD to notify userspace to
   fetch the information about the page fault. In the VM case, inject the
   page fault into the VM and let the guest resolve it.
Eric has sent the series below for page fault reporting for a VM with a
passthrough device.
https://lore.kernel.org/kvm/[email protected]/
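To make steps 3) and 4) concrete, below is a minimal sketch of the device
driver side, assuming the existing iommu_register_device_fault_handler()
path and an eventfd that userspace registered with the driver beforehand;
queueing of the fault details for userspace to read is omitted:

  #include <linux/iommu.h>
  #include <linux/eventfd.h>

  struct my_fault_ctx {
          struct eventfd_ctx *trigger;  /* eventfd registered by userspace */
  };

  /* step 3): invoked by the iommu driver via iommu_report_device_fault() */
  static int my_dev_fault_handler(struct iommu_fault *fault, void *data)
  {
          struct my_fault_ctx *ctx = data;

          if (fault->type != IOMMU_FAULT_PAGE_REQ)
                  return -EOPNOTSUPP;

          /*
           * fault->prm carries the PASID, faulting address and permissions
           * parsed in step 2). Step 4): kick the eventfd so userspace (e.g.
           * QEMU) fetches the fault and injects it into the guest.
           */
          eventfd_signal(ctx->trigger, 1);
          return 0;
  }

  static int my_enable_fault_reporting(struct device *dev,
                                       struct my_fault_ctx *ctx)
  {
          return iommu_register_device_fault_handler(dev, my_dev_fault_handler,
                                                     ctx);
  }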
Regards,
Yi Liu
> Jason
> From: Jason Gunthorpe <[email protected]>
> Sent: Friday, April 2, 2021 12:04 AM
>
> On Thu, Apr 01, 2021 at 02:08:17PM +0000, Liu, Yi L wrote:
>
> > DMA page faults are delivered to root-complex via page request message
> and
> > it is per-device according to PCIe spec. Page request handling flow is:
> >
> > 1) iommu driver receives a page request from device
> > 2) iommu driver parses the page request message. Get the RID,PASID,
> faulted
> > page and requested permissions etc.
> > 3) iommu driver triggers fault handler registered by device driver with
> > iommu_report_device_fault()
>
> This seems confused.
>
> The PASID should define how to handle the page fault, not the driver.
>
> I don't remember any device specific actions in ATS, so what is the
> driver supposed to do?
>
> > 4) device driver's fault handler signals an event FD to notify userspace to
> > fetch the information about the page fault. If it's VM case, inject the
> > page fault to VM and let guest to solve it.
>
> If the PASID is set to 'report page fault to userspace' then some
> event should come out of /dev/ioasid, or be reported to a linked
> eventfd, or whatever.
>
> If the PASID is set to 'SVM' then the fault should be passed to
> handle_mm_fault
>
> And so on.
>
> Userspace chooses what happens based on how they configure the PASID
> through /dev/ioasid.
>
> Why would a device driver get involved here?
>
> > Eric has sent below series for the page fault reporting for VM with passthru
> > device.
> > https://lore.kernel.org/kvm/20210223210625.604517-5-
> [email protected]/
>
> It certainly should not be in vfio pci. Everything using a PASID needs
> this infrastructure, VDPA, mdev, PCI, CXL, etc.
>
This touches an interesting fact:
The fault may be triggered in either the 1st-level or the 2nd-level page
table when nested translation is enabled (the vSVA case). The 1st-level is
bound by the user space, which therefore needs to receive the fault event.
The 2nd-level is managed by VFIO (or vDPA), which needs to fix the fault in
the kernel (e.g. find the HVA for the faulting GPA, call handle_mm_fault and
map GPA->HPA in the IOMMU). Yi's current proposal lets VFIO register the
device fault handler, which then forwards the event through /dev/ioasid
to userspace only if it is a 1st-level fault. Are you suggesting a pgtable-
centric fault reporting mechanism with separate handlers for each level,
i.e. letting VFIO register a handler only for 2nd-level faults and /dev/
ioasid register a handler for 1st-level faults?
Thanks
Kevin
> From: Jason Gunthorpe <[email protected]>
> Sent: Thursday, April 1, 2021 9:47 PM
>
> On Thu, Apr 01, 2021 at 01:43:36PM +0000, Liu, Yi L wrote:
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Thursday, April 1, 2021 9:16 PM
> > >
> > > On Thu, Apr 01, 2021 at 01:10:48PM +0000, Liu, Yi L wrote:
> > > > > From: Jason Gunthorpe <[email protected]>
> > > > > Sent: Thursday, April 1, 2021 7:47 PM
> > > > [...]
> > > > > I'm worried Intel views the only use of PASID in a guest is with
> > > > > ENQCMD, but that is not consistent with the industry. We need to see
> > > > > normal nested PASID support with assigned PCI VFs.
> > > >
> > > > I'm not quire flow here. Intel also allows PASID usage in guest without
> > > > ENQCMD. e.g. Passthru a PF to guest, and use PASID on it without
> > > ENQCMD.
> > >
> > > Then you need all the parts, the hypervisor calls from the vIOMMU, and
> > > you can't really use a vPASID.
> >
> > This is a diagram shows the vSVA setup.
>
> I'm not talking only about vSVA. Generic PASID support with arbitary
> mappings.
>
> And how do you deal with the vPASID vs pPASID issue if the system has
> a mix of physical devices and mdevs?
>
We plan to support two schemes. One is vPASID identity-mapped to
pPASID; then the mixed scenario just works, with the limitation of
lacking live migration support. The other is a non-identity-mapped
scheme, where live migration is supported but physical devices and
mdevs should not be mixed in one VM if both expose SVA capability
(requires some filtering check in Qemu). Although we have some
ideas for relaxing this restriction in the non-identity scheme, it requires
more thinking given how the vSVA uAPI is being refactored.
In both cases the virtual VT-d will report a virtual capability to the guest,
indicating that the guest must request PASIDs through a vcmd register
instead of creating its own namespace. The vIOMMU returns a vPASID
to the guest upon request. The vPASID could be directly mapped to a
pPASID or allocated from a new namespace based on user configuration.
We hope /dev/ioasid can support both schemes, with the minimal
requirement of allowing userspace to tag a vPASID to a pPASID and
allowing mdev to translate a vPASID into a pPASID, i.e. not assuming that
the guest will always use the pPASID.
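Purely as an illustration of that minimal requirement (none of these
structures or helpers exist today), the tagging could be as small as a
per-VM xarray that the mdev driver consults when the guest programs a
vPASID:

  #include <linux/xarray.h>
  #include <linux/ioasid.h>

  /* hypothetical per-VM vPASID -> pPASID table, filled when userspace
   * tags a vPASID to the pPASID it got from /dev/ioasid */
  struct vpasid_table {
          struct xarray map;              /* index: vPASID, entry: pPASID */
  };

  static int vpasid_tag(struct vpasid_table *t, u32 vpasid, ioasid_t ppasid)
  {
          return xa_err(xa_store(&t->map, vpasid, xa_mk_value(ppasid),
                                 GFP_KERNEL));
  }

  /* e.g. used by an mdev driver when the guest writes a vPASID to a
   * device register and ENQCMD translation is not in the picture */
  static ioasid_t vpasid_to_ppasid(struct vpasid_table *t, u32 vpasid)
  {
          void *entry = xa_load(&t->map, vpasid);

          return entry ? (ioasid_t)xa_to_value(entry) : INVALID_IOASID;
  }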
Thanks
Kevin
> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, March 30, 2021 9:29 PM
>
> >
> > First, userspace may use ioasid in a non-SVA scenario where ioasid is
> > bound to specific security context (e.g. a control vq in vDPA) instead of
> > tying to mm. In this case there is no pgtable binding initiated from user
> > space. Instead, ioasid is allocated from /dev/ioasid and then programmed
> > to the intended security context through specific passthrough framework
> > which manages that context.
>
> This sounds like the exact opposite of what I'd like to see.
>
> I do not want to see every subsystem gaining APIs to program a
> PASID. All of that should be consolidated in *one place*.
>
> I do not want to see VDPA and VFIO have two nearly identical sets of
> APIs to control the PASID.
>
> Drivers consuming a PASID, like VDPA, should consume the PASID and do
> nothing more than authorize the HW to use it.
>
> quemu should have general code under the viommu driver that drives
> /dev/ioasid to create PASID's and manage the IO mapping according to
> the guest's needs.
>
> Drivers like VDPA and VFIO should simply accept that PASID and
> configure/authorize their HW to do DMA's with its tag.
>
I agree with you on consolidating things in one place (especially for the
general SVA support). But here I was referring to an usage without
pgtable binding (Possibly Jason. W can say more here), where the
userspace just wants to allocate PASIDs, program/accept PASIDs to
various workqueues (device specific), and then use MAP/UNMAP
interface to manage address spaces associated with each PASID.
I just wanted to point out that the latter two steps are through
VFIO/VDPA specific interfaces.
Thanks
Kevin
> From: Tian, Kevin
> Sent: Friday, April 2, 2021 3:58 PM
>
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Thursday, April 1, 2021 9:47 PM
> >
> > On Thu, Apr 01, 2021 at 01:43:36PM +0000, Liu, Yi L wrote:
> > > > From: Jason Gunthorpe <[email protected]>
> > > > Sent: Thursday, April 1, 2021 9:16 PM
> > > >
> > > > On Thu, Apr 01, 2021 at 01:10:48PM +0000, Liu, Yi L wrote:
> > > > > > From: Jason Gunthorpe <[email protected]>
> > > > > > Sent: Thursday, April 1, 2021 7:47 PM
> > > > > [...]
> > > > > > I'm worried Intel views the only use of PASID in a guest is with
> > > > > > ENQCMD, but that is not consistent with the industry. We need to
> see
> > > > > > normal nested PASID support with assigned PCI VFs.
> > > > >
> > > > > I'm not quire flow here. Intel also allows PASID usage in guest without
> > > > > ENQCMD. e.g. Passthru a PF to guest, and use PASID on it without
> > > > ENQCMD.
> > > >
> > > > Then you need all the parts, the hypervisor calls from the vIOMMU, and
> > > > you can't really use a vPASID.
> > >
> > > This is a diagram shows the vSVA setup.
> >
> > I'm not talking only about vSVA. Generic PASID support with arbitary
> > mappings.
> >
> > And how do you deal with the vPASID vs pPASID issue if the system has
> > a mix of physical devices and mdevs?
> >
>
> We plan to support two schemes. One is vPASID identity-mapped to
> pPASID then the mixed scenario just works, with the limitation of
> lacking of live migration support. The other is non-identity-mapped
> scheme, where live migration is supported but physical devices and
> mdevs should not be mixed in one VM if both expose SVA capability
> (requires some filtering check in Qemu). Although we have some
> idea relaxing this restriction in the non-identity scheme, it requires
> more thinking given how the vSVA uAPI is being refactored.
>
> In both cases the virtual VT-d will report a virtual capability to the guest,
> indicating that the guest must request PASID through a vcmd register
> instead of creating its own namespace. The vIOMMU returns a vPASID
> to the guest upon request. The vPASID could be directly mapped to a
> pPASID or allocated from a new namespace based on user configuration.
>
> We hope the /dev/ioasid can support both schemes, with the minimal
> requirement of allowing userspace to tag a vPASID to a pPASID and
> allowing mdev to translate vPASID into pPASID, i.e. not assuming that
> the guest will always use pPASID.
>
Per your comments in other threads I suppose this requirement should
be implemented in the VFIO_ALLOW_PASID command instead of going
through /dev/ioasid, which only needs to know the pPASID and its pgtable
management.
Thanks
Kevin
Hi Jason,
> From: Jason Gunthorpe <[email protected]>
> Sent: Thursday, April 1, 2021 7:54 PM
>
> On Thu, Apr 01, 2021 at 07:04:01AM +0000, Liu, Yi L wrote:
>
> > After reading your reply in https://lore.kernel.org/linux-
> iommu/[email protected]/#t
> > So you mean /dev/ioasid FD is per-VM instead of per-ioasid, so above
> skeleton
> > doesn't suit your idea.
>
> You can do it one PASID per FD or multiple PASID's per FD. Most likely
> we will have high numbers of PASID's in a qemu process so I assume
> that number of FDs will start to be a contraining factor, thus
> multiplexing is reasonable.
>
> It doesn't really change anything about the basic flow.
>
> digging deeply into it either seems like a reasonable choice.
>
> > +-----------------------------+-----------------------------------------------+
> > | userspace | kernel space |
> > +-----------------------------+-----------------------------------------------+
> > | ioasid_fd = | /dev/ioasid does below: |
> > | open("/dev/ioasid", O_RDWR);| struct ioasid_fd_ctx { |
> > | | struct list_head ioasid_list; |
> > | | ... |
> > | | } ifd_ctx; // ifd_ctx is per ioasid_fd |
>
> Sure, possibly an xarray not a list
>
> > +-----------------------------+-----------------------------------------------+
> > | ioctl(ioasid_fd, | /dev/ioasid does below: |
> > | ALLOC, &ioasid); | struct ioasid_data { |
> > | | ioasid_t ioasid; |
> > | | struct list_head device_list; |
> > | | struct list_head next; |
> > | | ... |
> > | | } id_data; // id_data is per ioasid |
> > | | |
> > | | list_add(&id_data.next, |
> > | | &ifd_ctx.ioasid_list);
> > |
>
> Yes, this should have a kref in it too
>
> > +-----------------------------+-----------------------------------------------+
> > | ioctl(device_fd, | VFIO does below: |
> > | DEVICE_ALLOW_IOASID, | 1) get ioasid_fd, check if ioasid_fd is valid |
> > | ioasid_fd, | 2) check if ioasid is allocated from ioasid_fd|
> > | ioasid); | 3) register device/domain info to /dev/ioasid |
> > | | tracked in id_data.device_list |
> > | | 4) record the ioasid in VFIO's per-device |
> > | | ioasid list for future security check |
>
> You would provide a function that does steps 1&2 look at eventfd for
> instance.
>
> I'm not sure we need to register the device with the ioasid. device
> should incr the kref on the ioasid_data at this point.
>
> > +-----------------------------+-----------------------------------------------+
> > | ioctl(ioasid_fd, | /dev/ioasid does below: |
> > | BIND_PGTBL, | 1) find ioasid's id_data |
> > | pgtbl_data, | 2) loop the id_data.device_list and tell iommu|
> > | ioasid); | give ioasid access to the devices
> > |
>
> This seems backwards, DEVICE_ALLOW_IOASID should tell the iommu to
> give the ioasid to the device.
>
> Here the ioctl should be about assigning a memory map from the the
> current
> mm_struct to the pasid
>
> > +-----------------------------+-----------------------------------------------+
> > | ioctl(ioasid_fd, | /dev/ioasid does below: |
> > | UNBIND_PGTBL, | 1) find ioasid's id_data |
> > | ioasid); | 2) loop the id_data.device_list and tell iommu|
> > | | clear ioasid access to the devices |
>
> Also seems backwards. The ioctl here should be 'destroy ioasid' which
> wipes out the page table, halts DMA access and parks the PASID until
> all users are done.
>
> > +-----------------------------+-----------------------------------------------+
> > | ioctl(device_fd, | VFIO does below: |
> > | DEVICE_DISALLOW_IOASID,| 1) check if ioasid is associated in VFIO's |
> > | ioasid_fd, | device ioasid list. |
> > | ioasid); | 2) unregister device/domain info from |
> > | | /dev/ioasid, clear in id_data.device_list |
>
> This should disconnect the iommu and kref_put the ioasid_data
Thanks for the comments. I updated the skeleton a little bit and accepted
your xarray and kref suggestions.
+-----------------------------+------------------------------------------------+
| userspace | kernel space |
+-----------------------------+------------------------------------------------+
| ioasid_fd = | /dev/ioasid does below: |
| open("/dev/ioasid", O_RDWR);| struct ioasid_fd_ctx { |
| | struct xarray xa; |
| | ... |
| | } ifd_ctx; // ifd_ctx is per ioasid_fd |
+-----------------------------+------------------------------------------------+
| ioctl(ioasid_fd, | /dev/ioasid does below: |
| ALLOC, &ioasid); | struct ioasid_data { |
| | ioasid_t ioasid; |
| | refcount_t refs; |
| | ... |
| | } id_data; // id_data is per ioasid |
| | |
| | refcount_set(&id_data->refs, 1); |
+-----------------------------+------------------------------------------------+
| ioctl(device_fd, | VFIO does below: |
| DEVICE_ALLOW_IOASID, | 1) get ioasid_fd, check if ioasid_fd is valid |
| ioasid_fd, | 2) check if ioasid is allocated from ioasid_fd |
| ioasid); | 3) incr refcount on the ioasid |
| | 4) tell iommu to give the ioasid to the device |
| | by an iommu API. iommu driver needs to |
| | store the ioasid/device info in a per |
| | ioasid allow device list |
| | 5) record the ioasid in VFIO's per-device |
| | ioasid list for future security check |
+-----------------------------+------------------------------------------------+
| ioctl(ioasid_fd, | /dev/ioasid does below: |
| BIND_PGTBL, | 1) find ioasid's id_data |
| pgtbl_data, | 2) call into iommu driver with ioasid, pgtbl |
| ioasid); | data, iommu driver setup the PASID entry[1] |
| | with the ioasid and the pgtbl_data |
+-----------------------------+------------------------------------------------+
| ioctl(ioasid_fd, | /dev/ioasid does below: |
| CACHE_INVLD, | 1) find ioasid's id_data |
| inv_data, | 2) call into iommu driver with ioasid, inv |
| ioasid); | data, iommu driver invalidates cache |
+-----------------------------+------------------------------------------------+
| ioctl(ioasid_fd, | /dev/ioasid does below: |
| UNBIND_PGTBL, | 1) find ioasid's id_data |
| ioasid); | 2) call into iommu driver with ioasid, iommu |
| | driver destroy the PASID entry to block DMA |
| | with this ioasid from device |
+-----------------------------+------------------------------------------------+
| ioctl(device_fd, | VFIO does below: |
| DEVICE_DISALLOW_IOASID,| 1) check if ioasid is associated in VFIO's |
| ioasid_fd, | device ioasid list |
| ioasid); | 2) tell iommu driver to clear the device from |
| | its per-ioasid device allow list |
| | 3) put refcount on the ioasid |
+-----------------------------+------------------------------------------------+
| ioctl(ioasid_fd, | /dev/ioasid does below: |
| FREE, ioasid); | xa_erase(&ifd_ctx.xa, ioasid); |
+-----------------------------+------------------------------------------------+
[1] A PASID entry is an entry in a per-device PASID table; this is where the
page table pointer is stored, e.g. the guest CR3 page table pointer. Setting
up a PASID entry in a device's PASID table means the access is finally
granted on the IOMMU side.
I kept FREE as it seems more symmetric since there is an ALLOC exposed
to userspace. But yeah, I'm open to removing it all the same if you think
it's really unnecessary.
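Expressed as userspace code, the flow in the table above would look roughly
like the sketch below. Every ioctl name, request number and struct layout
here is a made-up placeholder mirroring the strawman, not an existing uAPI,
and error handling is dropped:

  #include <fcntl.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <linux/ioctl.h>

  struct ioasid_bind_data { unsigned int ioasid; unsigned long long gpgd; };
  struct vfio_ioasid { int ioasid_fd; unsigned int ioasid; };

  /* placeholder request numbers, for illustration only */
  #define IOASID_ALLOC                _IOR('i', 0, unsigned int)
  #define IOASID_FREE                 _IOW('i', 1, unsigned int)
  #define IOASID_BIND_PGTBL           _IOW('i', 2, struct ioasid_bind_data)
  #define IOASID_UNBIND_PGTBL         _IOW('i', 3, unsigned int)
  #define VFIO_DEVICE_ALLOW_IOASID    _IOW('V', 100, struct vfio_ioasid)
  #define VFIO_DEVICE_DISALLOW_IOASID _IOW('V', 101, struct vfio_ioasid)

  static int bind_guest_pasid(int device_fd, unsigned long long guest_pgd)
  {
          int fd = open("/dev/ioasid", O_RDWR);
          unsigned int ioasid;
          struct vfio_ioasid allow;
          struct ioasid_bind_data bind;

          ioctl(fd, IOASID_ALLOC, &ioasid);

          /* VFIO checks the fd/ioasid pair, takes a ref, authorizes the device */
          allow = (struct vfio_ioasid){ .ioasid_fd = fd, .ioasid = ioasid };
          ioctl(device_fd, VFIO_DEVICE_ALLOW_IOASID, &allow);

          /* iommu driver fills the PASID entry with the guest page table */
          bind = (struct ioasid_bind_data){ .ioasid = ioasid, .gpgd = guest_pgd };
          ioctl(fd, IOASID_BIND_PGTBL, &bind);

          /* ... guest runs; CACHE_INVLD requests come down the same fd ... */

          /* teardown in the reverse order */
          ioctl(fd, IOASID_UNBIND_PGTBL, &ioasid);
          ioctl(device_fd, VFIO_DEVICE_DISALLOW_IOASID, &allow);
          ioctl(fd, IOASID_FREE, &ioasid);
          close(fd);
          return 0;
  }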
Need your help again on an open question.
The major purpose of this series is to support vSVA for guests based on
nested translation. There is another usage which is also based on nested
translation but doesn't have an ioasid, and it still needs the
bind/unbind_pgtbl and cache_invalidation uAPIs: gIOVA support. In this
usage, the page table is a guest IOVA page table; the VMM needs to bind this
page table to the host and enable nested translation, and also needs to do
cache invalidation when the guest IOVA page table changes. It's very similar
to the page table bind of vSVA. The only difference is there is no ioasid
in the gIOVA case. Instead, the gIOVA case requires device information. But
to reuse the uAPI, gIOVA needs to fit into the /dev/ioasid model.
As of now, I think it may require userspace to pass a device FD to the
BIND/UNBIND_PGTBL and CACHE_INVLD ioctls, so the iommu driver can bind the
gIOVA page table to the correct device. Not sure if it looks good. Do you
have any suggestions on it?
[...]
> Include a sequence showing how the kvm FD is used to program the
> vPASID to pPASID table that ENQCMD uses.
>
> Show how dynamic authorization works based on requests from the
> guest's vIOMMU
I'd like to see if the updated skeleton suits your idea first, then
draw a more complete flow to show this.
Regards,
Yi Liu
> Jason
On Fri, Apr 02, 2021 at 07:30:23AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Friday, April 2, 2021 12:04 AM
> >
> > On Thu, Apr 01, 2021 at 02:08:17PM +0000, Liu, Yi L wrote:
> >
> > > DMA page faults are delivered to root-complex via page request message
> > and
> > > it is per-device according to PCIe spec. Page request handling flow is:
> > >
> > > 1) iommu driver receives a page request from device
> > > 2) iommu driver parses the page request message. Get the RID,PASID,
> > faulted
> > > page and requested permissions etc.
> > > 3) iommu driver triggers fault handler registered by device driver with
> > > iommu_report_device_fault()
> >
> > This seems confused.
> >
> > The PASID should define how to handle the page fault, not the driver.
> >
> > I don't remember any device specific actions in ATS, so what is the
> > driver supposed to do?
> >
> > > 4) device driver's fault handler signals an event FD to notify userspace to
> > > fetch the information about the page fault. If it's VM case, inject the
> > > page fault to VM and let guest to solve it.
> >
> > If the PASID is set to 'report page fault to userspace' then some
> > event should come out of /dev/ioasid, or be reported to a linked
> > eventfd, or whatever.
> >
> > If the PASID is set to 'SVM' then the fault should be passed to
> > handle_mm_fault
> >
> > And so on.
> >
> > Userspace chooses what happens based on how they configure the PASID
> > through /dev/ioasid.
> >
> > Why would a device driver get involved here?
> >
> > > Eric has sent below series for the page fault reporting for VM with passthru
> > > device.
> > > https://lore.kernel.org/kvm/20210223210625.604517-5-
> > [email protected]/
> >
> > It certainly should not be in vfio pci. Everything using a PASID needs
> > this infrastructure, VDPA, mdev, PCI, CXL, etc.
> >
>
> This touches an interesting fact:
>
> The fault may be triggered in either 1st-level or 2nd-level page table,
> when nested translation is enabled (in vSVA case). The 1st-level is bound
> by the user space, which therefore needs to receive the fault event. The
> 2nd-level is managed by VFIO (or vDPA), which needs to fix the fault in
> kernel (e.g. find HVA per faulting GPA, call handle_mm_fault and map
> GPA->HPA to IOMMU). Yi's current proposal lets VFIO to register the
> device fault handler, which then forward the event through /dev/ioasid
> to userspace only if it is a 1st-level fault. Are you suggesting a pgtable-
> centric fault reporting mechanism to separate handlers in each level,
> i.e. letting VFIO register handler only for 2nd-level fault and then /dev/
> ioasid register handler for 1st-level fault?
This I'm struggling to understand. /dev/ioasid should handle all the
fault cases, so why would VFIO ever get involved in a fault? What would
it even do?
If the fault needs to be fixed in the hypervisor then it is a kernel
fault and it does handle_mm_fault. This absolutely should not be in
VFIO or VDPA
If the fault needs to be fixed in the guest, then it needs to be
delivered over /dev/ioasid in some way and injected into the
vIOMMU. VFIO and VDPA have nothing to do with the vIOMMU driver in qemu.
You need to have an interface under /dev/ioasid to create both page
table levels and part of that will be to tell the kernel what VA is
mapped and how to handle faults.
VFIO/VDPA do *nothing* more than authorize the physical device to use
the given PASID.
In the VDPA case you might need to have SW access to the PASID, but
that should be provided by a generic iommu layer interface like
'copy_to/from_pasid()' not by involving VDPA in the address mapping.
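A purely hypothetical sketch of what such generic helpers could look like
(only the names suggested above, not an existing API):

  #include <linux/device.h>
  #include <linux/ioasid.h>

  /* let a kernel consumer such as a vDPA control path read/write memory
   * through a PASID's address space without owning the mapping itself */
  int iommu_copy_to_pasid(struct device *dev, ioasid_t pasid,
                          unsigned long iova, const void *src, size_t len);
  int iommu_copy_from_pasid(struct device *dev, ioasid_t pasid,
                            unsigned long iova, void *dst, size_t len);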
Jason
On Fri, Apr 02, 2021 at 07:58:02AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Thursday, April 1, 2021 9:47 PM
> >
> > On Thu, Apr 01, 2021 at 01:43:36PM +0000, Liu, Yi L wrote:
> > > > From: Jason Gunthorpe <[email protected]>
> > > > Sent: Thursday, April 1, 2021 9:16 PM
> > > >
> > > > On Thu, Apr 01, 2021 at 01:10:48PM +0000, Liu, Yi L wrote:
> > > > > > From: Jason Gunthorpe <[email protected]>
> > > > > > Sent: Thursday, April 1, 2021 7:47 PM
> > > > > [...]
> > > > > > I'm worried Intel views the only use of PASID in a guest is with
> > > > > > ENQCMD, but that is not consistent with the industry. We need to see
> > > > > > normal nested PASID support with assigned PCI VFs.
> > > > >
> > > > > I'm not quire flow here. Intel also allows PASID usage in guest without
> > > > > ENQCMD. e.g. Passthru a PF to guest, and use PASID on it without
> > > > ENQCMD.
> > > >
> > > > Then you need all the parts, the hypervisor calls from the vIOMMU, and
> > > > you can't really use a vPASID.
> > >
> > > This is a diagram shows the vSVA setup.
> >
> > I'm not talking only about vSVA. Generic PASID support with arbitary
> > mappings.
> >
> > And how do you deal with the vPASID vs pPASID issue if the system has
> > a mix of physical devices and mdevs?
> >
>
> We plan to support two schemes. One is vPASID identity-mapped to
> pPASID then the mixed scenario just works, with the limitation of
> lacking of live migration support. The other is non-identity-mapped
> scheme, where live migration is supported but physical devices and
> mdevs should not be mixed in one VM if both expose SVA capability
> (requires some filtering check in Qemu).
That just becomes "block vPASID support if any device that
doesn't use ENQCMD is plugged into the guest"
Which needs a special VFIO capability of some kind so qemu knows to
block it. This really needs to all be laid out together so someone
can understand it :(
Why doesn't the SIOV cookbook explain this stuff??
> We hope the /dev/ioasid can support both schemes, with the minimal
> requirement of allowing userspace to tag a vPASID to a pPASID and
> allowing mdev to translate vPASID into pPASID, i.e. not assuming that
> the guest will always use pPASID.
What I'm unclear on is whether /dev/ioasid even needs to care about
vPASID or whether vPASID is just a hidden artifact of the KVM connection
used to set up the translation table and the vIOMMU driver in qemu.
Since the physical HW never sees the vPASID, I'm inclined to think the
latter.
Jason
On Fri, Apr 02, 2021 at 08:22:28AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Tuesday, March 30, 2021 9:29 PM
> >
> > >
> > > First, userspace may use ioasid in a non-SVA scenario where ioasid is
> > > bound to specific security context (e.g. a control vq in vDPA) instead of
> > > tying to mm. In this case there is no pgtable binding initiated from user
> > > space. Instead, ioasid is allocated from /dev/ioasid and then programmed
> > > to the intended security context through specific passthrough framework
> > > which manages that context.
> >
> > This sounds like the exact opposite of what I'd like to see.
> >
> > I do not want to see every subsystem gaining APIs to program a
> > PASID. All of that should be consolidated in *one place*.
> >
> > I do not want to see VDPA and VFIO have two nearly identical sets of
> > APIs to control the PASID.
> >
> > Drivers consuming a PASID, like VDPA, should consume the PASID and do
> > nothing more than authorize the HW to use it.
> >
> > quemu should have general code under the viommu driver that drives
> > /dev/ioasid to create PASID's and manage the IO mapping according to
> > the guest's needs.
> >
> > Drivers like VDPA and VFIO should simply accept that PASID and
> > configure/authorize their HW to do DMA's with its tag.
> >
>
> I agree with you on consolidating things in one place (especially for the
> general SVA support). But here I was referring to an usage without
> pgtable binding (Possibly Jason. W can say more here), where the
> userspace just wants to allocate PASIDs, program/accept PASIDs to
> various workqueues (device specific), and then use MAP/UNMAP
> interface to manage address spaces associated with each PASID.
> I just wanted to point out that the latter two steps are through
> VFIO/VDPA specific interfaces.
No, don't do that.
VFIO and VDPA have no business having map/unmap interfaces once we have
/dev/ioasid. That all belongs on the ioasid side.
I know they have those interfaces today, but that doesn't mean we have
to keep using them for PASID use cases; they should be replaced with a
'do dma from this pasid on /dev/ioasid' interface, certainly not a
'here is a pasid from /dev/ioasid, go ahead and configure it yourself'
interface.
This is because PASID is *complicated* in the general case! For
instance all the two level stuff you are talking about must not leak
into every user!
Jason
> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, April 6, 2021 7:35 AM
>
> On Fri, Apr 02, 2021 at 07:30:23AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Friday, April 2, 2021 12:04 AM
> > >
> > > On Thu, Apr 01, 2021 at 02:08:17PM +0000, Liu, Yi L wrote:
> > >
> > > > DMA page faults are delivered to root-complex via page request
> message
> > > and
> > > > it is per-device according to PCIe spec. Page request handling flow is:
> > > >
> > > > 1) iommu driver receives a page request from device
> > > > 2) iommu driver parses the page request message. Get the RID,PASID,
> > > faulted
> > > > page and requested permissions etc.
> > > > 3) iommu driver triggers fault handler registered by device driver with
> > > > iommu_report_device_fault()
> > >
> > > This seems confused.
> > >
> > > The PASID should define how to handle the page fault, not the driver.
> > >
> > > I don't remember any device specific actions in ATS, so what is the
> > > driver supposed to do?
> > >
> > > > 4) device driver's fault handler signals an event FD to notify userspace
> to
> > > > fetch the information about the page fault. If it's VM case, inject the
> > > > page fault to VM and let guest to solve it.
> > >
> > > If the PASID is set to 'report page fault to userspace' then some
> > > event should come out of /dev/ioasid, or be reported to a linked
> > > eventfd, or whatever.
> > >
> > > If the PASID is set to 'SVM' then the fault should be passed to
> > > handle_mm_fault
> > >
> > > And so on.
> > >
> > > Userspace chooses what happens based on how they configure the PASID
> > > through /dev/ioasid.
> > >
> > > Why would a device driver get involved here?
> > >
> > > > Eric has sent below series for the page fault reporting for VM with
> passthru
> > > > device.
> > > > https://lore.kernel.org/kvm/20210223210625.604517-5-
> > > [email protected]/
> > >
> > > It certainly should not be in vfio pci. Everything using a PASID needs
> > > this infrastructure, VDPA, mdev, PCI, CXL, etc.
> > >
> >
> > This touches an interesting fact:
> >
> > The fault may be triggered in either 1st-level or 2nd-level page table,
> > when nested translation is enabled (in vSVA case). The 1st-level is bound
> > by the user space, which therefore needs to receive the fault event. The
> > 2nd-level is managed by VFIO (or vDPA), which needs to fix the fault in
> > kernel (e.g. find HVA per faulting GPA, call handle_mm_fault and map
> > GPA->HPA to IOMMU). Yi's current proposal lets VFIO to register the
> > device fault handler, which then forward the event through /dev/ioasid
> > to userspace only if it is a 1st-level fault. Are you suggesting a pgtable-
> > centric fault reporting mechanism to separate handlers in each level,
> > i.e. letting VFIO register handler only for 2nd-level fault and then /dev/
> > ioasid register handler for 1st-level fault?
>
> This I'm struggling to understand. /dev/ioasid should handle all the
> faults cases, why would VFIO ever get involved in a fault? What would
> it even do?
>
> If the fault needs to be fixed in the hypervisor then it is a kernel
> fault and it does handle_mm_fault. This absolutely should not be in
> VFIO or VDPA
With nested translation it is GVA->GPA->HPA. The kernel needs to
fix faults related to GPA->HPA (managed by VFIO/VDPA) while
handle_mm_fault only handles HVA->HPA. In this case, the 2nd-level
page fault is expected to be delivered to VFIO/VDPA first, which then
finds the HVA related to the GPA, calls handle_mm_fault to fix HVA->HPA,
and then calls iommu_map to fix GPA->HPA in the IOMMU page table.
This is exactly like how a CPU EPT violation is handled.
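A rough sketch of that 2nd-level fix-up, assuming VFIO/vDPA has already
resolved the faulting GPA to its backing HVA from its own tracking of the
guest memory layout and is running in the context of the mm that owns the
HVA (pinning policy and error paths simplified):

  #include <linux/iommu.h>
  #include <linux/mm.h>

  static int fixup_second_level_fault(struct iommu_domain *domain,
                                      unsigned long gpa, unsigned long hva,
                                      bool write)
  {
          struct page *page;
          long ret;

          /* the handle_mm_fault() part: fault in the page backing the HVA */
          ret = get_user_pages(hva & PAGE_MASK, 1, write ? FOLL_WRITE : 0,
                               &page, NULL);
          if (ret != 1)
                  return ret < 0 ? ret : -EFAULT;

          /* fix GPA->HPA in the IOMMU 2nd-level page table */
          ret = iommu_map(domain, gpa & PAGE_MASK, page_to_phys(page),
                          PAGE_SIZE, IOMMU_READ | (write ? IOMMU_WRITE : 0));
          if (ret)
                  put_page(page);
          return ret;
  }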
>
> If the fault needs to be fixed in the guest, then it needs to be
> delivered over /dev/ioasid in some way and injected into the
> vIOMMU. VFIO and VDPA have nothing to do with vIOMMU driver in quemu.
>
> You need to have an interface under /dev/ioasid to create both page
> table levels and part of that will be to tell the kernel what VA is
> mapped and how to handle faults.
VFIO/VDPA already have their own interface to manage GPA->HPA
mappings. Why do we want to duplicate it in /dev/ioasid?
Thanks
Kevin
> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, April 6, 2021 7:40 AM
>
> On Fri, Apr 02, 2021 at 07:58:02AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Thursday, April 1, 2021 9:47 PM
> > >
> > > On Thu, Apr 01, 2021 at 01:43:36PM +0000, Liu, Yi L wrote:
> > > > > From: Jason Gunthorpe <[email protected]>
> > > > > Sent: Thursday, April 1, 2021 9:16 PM
> > > > >
> > > > > On Thu, Apr 01, 2021 at 01:10:48PM +0000, Liu, Yi L wrote:
> > > > > > > From: Jason Gunthorpe <[email protected]>
> > > > > > > Sent: Thursday, April 1, 2021 7:47 PM
> > > > > > [...]
> > > > > > > I'm worried Intel views the only use of PASID in a guest is with
> > > > > > > ENQCMD, but that is not consistent with the industry. We need to
> see
> > > > > > > normal nested PASID support with assigned PCI VFs.
> > > > > >
> > > > > > I'm not quire flow here. Intel also allows PASID usage in guest
> without
> > > > > > ENQCMD. e.g. Passthru a PF to guest, and use PASID on it without
> > > > > ENQCMD.
> > > > >
> > > > > Then you need all the parts, the hypervisor calls from the vIOMMU,
> and
> > > > > you can't really use a vPASID.
> > > >
> > > > This is a diagram shows the vSVA setup.
> > >
> > > I'm not talking only about vSVA. Generic PASID support with arbitary
> > > mappings.
> > >
> > > And how do you deal with the vPASID vs pPASID issue if the system has
> > > a mix of physical devices and mdevs?
> > >
> >
> > We plan to support two schemes. One is vPASID identity-mapped to
> > pPASID then the mixed scenario just works, with the limitation of
> > lacking of live migration support. The other is non-identity-mapped
> > scheme, where live migration is supported but physical devices and
> > mdevs should not be mixed in one VM if both expose SVA capability
> > (requires some filtering check in Qemu).
>
> That just becomes "block vPASID support if any device that
> doesn't use ENQCMD is plugged into the guest"
The limitation is only for physical devices, and in reality it is not that
bad. To support live migration with a physical device we anyway need
additional work to migrate the device state (e.g. based on Max's work),
so it's not unreasonable to also mediate guest programming of the
device-specific PASID register to enable vPASID (translation is needed for
the whole VM lifespan but is likely not a hot path).
>
> Which needs a special VFIO capability of some kind so qemu knows to
> block it. This really needs to all be layed out together so someone
> can understand it :(
Or it could simply be based on whether the VFIO device supports live migration.
>
> Why doesn't the siov cookbook explaining this stuff??
>
> > We hope the /dev/ioasid can support both schemes, with the minimal
> > requirement of allowing userspace to tag a vPASID to a pPASID and
> > allowing mdev to translate vPASID into pPASID, i.e. not assuming that
> > the guest will always use pPASID.
>
> What I'm a unclear of is if /dev/ioasid even needs to care about
> vPASID or if vPASID is just a hidden artifact of the KVM connection to
> setup the translation table and the vIOMMU driver in qemu.
Not just for KVM. Also required by mdev, which needs to translate
vPASID into pPASID when ENQCMD is not used. As I replied in another
mail, possibly we don't need /dev/ioasid to know this fact, which
should only care about the operations related to pPASID. VFIO could
carry vPASID information to mdev. KVM should have its own interface
to know this information, as you suggested earlier.
Thanks
Kevin
On 2021/4/6 7:42 AM, Jason Gunthorpe wrote:
> On Fri, Apr 02, 2021 at 08:22:28AM +0000, Tian, Kevin wrote:
>>> From: Jason Gunthorpe <[email protected]>
>>> Sent: Tuesday, March 30, 2021 9:29 PM
>>>
>>>> First, userspace may use ioasid in a non-SVA scenario where ioasid is
>>>> bound to specific security context (e.g. a control vq in vDPA) instead of
>>>> tying to mm. In this case there is no pgtable binding initiated from user
>>>> space. Instead, ioasid is allocated from /dev/ioasid and then programmed
>>>> to the intended security context through specific passthrough framework
>>>> which manages that context.
>>> This sounds like the exact opposite of what I'd like to see.
>>>
>>> I do not want to see every subsystem gaining APIs to program a
>>> PASID. All of that should be consolidated in *one place*.
>>>
>>> I do not want to see VDPA and VFIO have two nearly identical sets of
>>> APIs to control the PASID.
>>>
>>> Drivers consuming a PASID, like VDPA, should consume the PASID and do
>>> nothing more than authorize the HW to use it.
>>>
>>> quemu should have general code under the viommu driver that drives
>>> /dev/ioasid to create PASID's and manage the IO mapping according to
>>> the guest's needs.
>>>
>>> Drivers like VDPA and VFIO should simply accept that PASID and
>>> configure/authorize their HW to do DMA's with its tag.
>>>
>> I agree with you on consolidating things in one place (especially for the
>> general SVA support). But here I was referring to an usage without
>> pgtable binding (Possibly Jason. W can say more here), where the
>> userspace just wants to allocate PASIDs, program/accept PASIDs to
>> various workqueues (device specific), and then use MAP/UNMAP
>> interface to manage address spaces associated with each PASID.
>> I just wanted to point out that the latter two steps are through
>> VFIO/VDPA specific interfaces.
> No, don't do that.
>
> VFIO and VDPA has no buisness having map/unmap interfaces once we have
> /dev/ioasid. That all belongs in the iosaid side.
>
> I know they have those interfaces today, but that doesn't mean we have
> to keep using them for PASID use cases, they should be replaced with a
> 'do dma from this pasid on /dev/ioasid' interface certainly not a
> 'here is a pasid from /dev/ioasid, go ahead and configure it youself'
> interface
So it looks like the PASID was bound to SVA in this design. I think that's
not necessarily the case:
1) PASID can be implemented without SVA, in which case a map/unmap
interface is still required
2) For the case where the hypervisor wants to do some mediation in the middle
for a virtqueue, e.g. a control vq that is implemented in
the VF/ADI/SF itself, the hardware virtqueue needs to be controlled by
Qemu. Binding qemu's page table to the cvq could work but looks like
overkill; a small dedicated buffer mapped for this PASID
seems more suitable.
>
> This is because PASID is *complicated* in the general case! For
> instance all the two level stuff you are talking about must not leak
> into every user!
>
> Jason
So do you mean the device should not expose the PASID configuration API
to the guest? I think it could happen if we assign the whole device and let
the guest configure it for nested VMs.
Thanks
>
> From: Jason Gunthorpe
> Sent: Tuesday, April 6, 2021 7:43 AM
>
> On Fri, Apr 02, 2021 at 08:22:28AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Tuesday, March 30, 2021 9:29 PM
> > >
> > > >
> > > > First, userspace may use ioasid in a non-SVA scenario where ioasid is
> > > > bound to specific security context (e.g. a control vq in vDPA) instead of
> > > > tying to mm. In this case there is no pgtable binding initiated from user
> > > > space. Instead, ioasid is allocated from /dev/ioasid and then
> programmed
> > > > to the intended security context through specific passthrough
> framework
> > > > which manages that context.
> > >
> > > This sounds like the exact opposite of what I'd like to see.
> > >
> > > I do not want to see every subsystem gaining APIs to program a
> > > PASID. All of that should be consolidated in *one place*.
> > >
> > > I do not want to see VDPA and VFIO have two nearly identical sets of
> > > APIs to control the PASID.
> > >
> > > Drivers consuming a PASID, like VDPA, should consume the PASID and do
> > > nothing more than authorize the HW to use it.
> > >
> > > quemu should have general code under the viommu driver that drives
> > > /dev/ioasid to create PASID's and manage the IO mapping according to
> > > the guest's needs.
> > >
> > > Drivers like VDPA and VFIO should simply accept that PASID and
> > > configure/authorize their HW to do DMA's with its tag.
> > >
> >
> > I agree with you on consolidating things in one place (especially for the
> > general SVA support). But here I was referring to an usage without
> > pgtable binding (Possibly Jason. W can say more here), where the
> > userspace just wants to allocate PASIDs, program/accept PASIDs to
> > various workqueues (device specific), and then use MAP/UNMAP
> > interface to manage address spaces associated with each PASID.
> > I just wanted to point out that the latter two steps are through
> > VFIO/VDPA specific interfaces.
>
> No, don't do that.
>
> VFIO and VDPA has no buisness having map/unmap interfaces once we have
> /dev/ioasid. That all belongs in the iosaid side.
>
> I know they have those interfaces today, but that doesn't mean we have
> to keep using them for PASID use cases, they should be replaced with a
> 'do dma from this pasid on /dev/ioasid' interface certainly not a
> 'here is a pasid from /dev/ioasid, go ahead and configure it youself'
> interface
>
> This is because PASID is *complicated* in the general case! For
> instance all the two level stuff you are talking about must not leak
> into every user!
>
Hi, Jason,
I didn't get your last comment about how the two-level stuff is leaked into
every user. Could you elaborate a bit?
And here is one example of why using the existing VFIO/VDPA interface makes
sense. Say dev1 (w/ sva) and dev2 (w/o sva) are placed in a single VFIO
container. The container is associated with an iommu domain which contains
a single 2nd-level page table, shared by both devices (when attached to
the domain). The VFIO MAP operation is applied to the 2nd-level page
table and thus naturally applies to both devices. Then userspace could use
/dev/ioasid to further allocate IOASIDs and bind multiple 1st-level page
tables for dev1, nested on the shared 2nd-level page table.
If we follow your suggestion, must VFIO then deny VFIO MAP operations
on dev1 (assuming userspace should not mix dev1 and dev2 in the same
container and instead use /dev/ioasid to map for dev1)? And even for
an sva-capable device there is a window before the guest actually enables
sva on that device; should VFIO still accept MAP in that window
and then deny it after sva is enabled by the guest? This all sounds
unnecessarily complex while there is already a clean way to achieve it...
Thanks
Kevin
> From: Tian, Kevin
> Sent: Tuesday, April 6, 2021 9:02 AM
>
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Tuesday, April 6, 2021 7:40 AM
> >
> > On Fri, Apr 02, 2021 at 07:58:02AM +0000, Tian, Kevin wrote:
> > > > From: Jason Gunthorpe <[email protected]>
> > > > Sent: Thursday, April 1, 2021 9:47 PM
> > > >
> > > > On Thu, Apr 01, 2021 at 01:43:36PM +0000, Liu, Yi L wrote:
> > > > > > From: Jason Gunthorpe <[email protected]>
> > > > > > Sent: Thursday, April 1, 2021 9:16 PM
> > > > > >
> > > > > > On Thu, Apr 01, 2021 at 01:10:48PM +0000, Liu, Yi L wrote:
> > > > > > > > From: Jason Gunthorpe <[email protected]>
> > > > > > > > Sent: Thursday, April 1, 2021 7:47 PM
> > > > > > > [...]
> > > > > > > > I'm worried Intel views the only use of PASID in a guest is with
> > > > > > > > ENQCMD, but that is not consistent with the industry. We need
> to
> > see
> > > > > > > > normal nested PASID support with assigned PCI VFs.
> > > > > > >
> > > > > > > I'm not quire flow here. Intel also allows PASID usage in guest
> > without
> > > > > > > ENQCMD. e.g. Passthru a PF to guest, and use PASID on it without
> > > > > > ENQCMD.
> > > > > >
> > > > > > Then you need all the parts, the hypervisor calls from the vIOMMU,
> > and
> > > > > > you can't really use a vPASID.
> > > > >
> > > > > This is a diagram shows the vSVA setup.
> > > >
> > > > I'm not talking only about vSVA. Generic PASID support with arbitary
> > > > mappings.
> > > >
> > > > And how do you deal with the vPASID vs pPASID issue if the system has
> > > > a mix of physical devices and mdevs?
> > > >
> > >
> > > We plan to support two schemes. One is vPASID identity-mapped to
> > > pPASID then the mixed scenario just works, with the limitation of
> > > lacking of live migration support. The other is non-identity-mapped
> > > scheme, where live migration is supported but physical devices and
> > > mdevs should not be mixed in one VM if both expose SVA capability
> > > (requires some filtering check in Qemu).
> >
> > That just becomes "block vPASID support if any device that
> > doesn't use ENQCMD is plugged into the guest"
>
> The limitation is only for physical device. and in reality it is not that
> bad. To support live migration with physical device we anyway need
> additional work to migrate the device state (e.g. based on Max's work),
> then it's not unreasonable to also mediate guest programming of
> device specific PASID register to enable vPASID (need to translate in
> the whole VM lifespan but likely is not a hot path).
>
> >
> > Which needs a special VFIO capability of some kind so qemu knows to
> > block it. This really needs to all be layed out together so someone
> > can understand it :(
>
> Or could simply based on whether the VFIO device supports live migration.
Actually you are right on this point. VFIO should provide a per-device
capability to indicate whether vPASID is allowed on this device: likely
yes for mdev, and no by default for pdev (unless it explicitly opts in). Qemu
should enable vPASID only if all assigned devices support it, and then
provide the vPASID information when using the VFIO API to allow pPASIDs.
>
> >
> > Why doesn't the siov cookbook explaining this stuff??
> >
> > > We hope the /dev/ioasid can support both schemes, with the minimal
> > > requirement of allowing userspace to tag a vPASID to a pPASID and
> > > allowing mdev to translate vPASID into pPASID, i.e. not assuming that
> > > the guest will always use pPASID.
> >
> > What I'm a unclear of is if /dev/ioasid even needs to care about
> > vPASID or if vPASID is just a hidden artifact of the KVM connection to
> > setup the translation table and the vIOMMU driver in qemu.
>
> Not just for KVM. Also required by mdev, which needs to translate
> vPASID into pPASID when ENQCMD is not used. As I replied in another
> mail, possibly we don't need /dev/ioasid to know this fact, which
> should only care about the operations related to pPASID. VFIO could
> carry vPASID information to mdev. KVM should have its own interface
> to know this information, as you suggested earlier.
>
> Thanks
> Kevin
On Tue, Apr 06, 2021 at 09:35:17AM +0800, Jason Wang wrote:
> > VFIO and VDPA has no buisness having map/unmap interfaces once we have
> > /dev/ioasid. That all belongs in the iosaid side.
> >
> > I know they have those interfaces today, but that doesn't mean we have
> > to keep using them for PASID use cases, they should be replaced with a
> > 'do dma from this pasid on /dev/ioasid' interface certainly not a
> > 'here is a pasid from /dev/ioasid, go ahead and configure it youself'
> > interface
>
> So it looks like the PASID was bound to SVA in this design. I think it's not
> necessairly the case:
No, I wish people would stop talking about SVA.
SVA and vSVA are a very special narrow configuration of a PASID. There
are lots of other PASID configurations! That is the whole point, a
PASID is complicated, there are many configuration scenarios, they
need to be in one place with a very clearly defined uAPI
> 1) PASID can be implemented without SVA, in this case a map/unmap interface
> is still required
Any interface to manipulate a PASID should be under /dev/ioasid. We do
not want to duplicate this into every subsystem.
> 2) For the case that hypervisor want to do some mediation in the middle for
> a virtqueue. e.g in the case of control vq that is implemented in the
> VF/ADI/SF itself, the hardware virtqueue needs to be controlled by Qemu,
> Though binding qemu's page table to cvq can work but it looks like a
> overkill, a small dedicated buffers that is mapped for this PASID seems more
> suitalbe.
/dev/ioasid should allow userspace to setup any PASID configuration it
wants. There are many choices. That is the whole point, instead of
copying & pasting all the PASID configuration options into every
subsystem, we have one place to configure it.
If you want a PASID (or generic ioasid) that has the guest physical
map, which is probably all that VDPA would ever want, then /dev/ioasid
should be able to prepare that.
If you just want to map a few buffers into a PASID then it should be
able to do that too.
> So do you mean the device should not expose the PASID confiugration API to
> guest? I think it could happen if we assign the whole device and let guest
> to configure it for nested VMs.
This always needs co-operation with the vIOMMU driver. We can't have
nested PASID use without both parts working together.
The vIOMMU driver configures the PASID and assigns the mappings
(however complicated that turns out to be)
The VDPA/mdev driver authorizes the HW to use the ioasid mapping, e.g.
by authorizing a queue to issue PCIe TLPs with a specific PASID.
The authorization is triggered by the guest telling the vIOMMU to
allow a vRID to talk to a PASID, which qemu would have to translate to
telling something like the VDPA driver under the vRID that it can use
a PASID from /dev/ioasid
For security a VDPA/mdev device MUST NOT issue PASIDs that the vIOMMU
has not authorized its vRID to use. Otherwise the security model of
something like VFIO in the guest becomes completely broken.
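A purely hypothetical sketch of that last rule as seen from an mdev/VDPA
driver; the names and structures below are invented for illustration and
are not an existing kernel API:

#include <linux/types.h>
#include <linux/errno.h>

struct my_vdpa_dev {
	u32 allowed_pasids[16];		/* filled when qemu relays the vIOMMU grant */
	unsigned int nr_allowed;
};

static bool my_pasid_is_allowed(struct my_vdpa_dev *vdev, u32 pasid)
{
	unsigned int i;

	for (i = 0; i < vdev->nr_allowed; i++)
		if (vdev->allowed_pasids[i] == pasid)
			return true;
	return false;
}

static int my_queue_set_pasid(struct my_vdpa_dev *vdev, u32 pasid)
{
	/* never issue TLPs with a PASID the vIOMMU has not authorized */
	if (!my_pasid_is_allowed(vdev, pasid))
		return -EPERM;

	/* ... program the HW queue to tag its DMA with this PASID ... */
	return 0;
}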
Jason
On Tue, Apr 06, 2021 at 01:27:15AM +0000, Tian, Kevin wrote:
>
> and here is one example why using existing VFIO/VDPA interface makes
> sense. say dev1 (w/ sva) and dev2 (w/o sva) are placed in a single VFIO
> container.
Forget about SVA, it is an irrelevant detail of how a PASID is
configured.
> The container is associated to an iommu domain which contains a
> single 2nd-level page table, shared by both devices (when attached
> to the domain).
This level should be described by an ioasid.
> The VFIO MAP operation is applied to the 2nd-level
> page table thus naturally applied to both devices. Then userspace
> could use /dev/ioasid to further allocate IOASIDs and bind multiple
> 1st-level page tables for dev1, nested on the shared 2nd-level page
> table.
Because if you don't then we enter an insane world where a PASID is being
created under /dev/ioasid but its translation path flows through setup
done by VFIO and the whole user API becomes an incomprehensible mess.
How will you even associate the PASID with the other translation??
The entire translation path for any ioasid or PASID should be defined
only by /dev/ioasid. Everything else is a legacy API.
> If following your suggestion then VFIO must deny VFIO MAP operations
> on sva1 (assume userspace should not mix sva1 and sva2 in the same
> container and instead use /dev/ioasid to map for sva1)?
No, userspace creates an ioasid for the guest physical mapping and
passes this ioasid to VFIO PCI which will assign it as the first layer
mapping on the RID
When PASIDs are allocated the uAPI will be told to logically nest
under the first ioasid. When VFIO authorizes a PASID for a RID it
checks that all the HW rules are being followed.
If there are rules like groups of VFIO devices must always use the
same IOASID then VFIO will check these too (and realistically qemu
will have only one guest physical map ioasid anyhow)
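To make this flow concrete, a rough sketch of what the userspace sequence
could look like; since /dev/ioasid does not exist yet, every ioctl name,
request code and struct below is a hypothetical placeholder rather than a
real API:

/* Hypothetical sketch only: placeholder ioctls for a not-yet-defined uAPI */
#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>

#define IOASID_ALLOC            0x00	/* placeholder request codes */
#define IOASID_MAP_DMA          0x01
#define IOASID_ALLOC_NESTED     0x02
#define VFIO_ATTACH_IOASID      0x03
#define VFIO_ALLOW_PASID        0x04

struct ioasid_map_dma { int ioasid; uint64_t iova, vaddr, size; };

static int setup_nested(int vfio_dev_fd, uint64_t guest_ram, uint64_t ram_size)
{
	int fd = open("/dev/ioasid", O_RDWR);

	/* 1. an ioasid holding the guest physical (GPA->HPA) map */
	int gpa_ioasid = ioctl(fd, IOASID_ALLOC, 0);
	struct ioasid_map_dma map = { gpa_ioasid, 0, guest_ram, ram_size };
	ioctl(fd, IOASID_MAP_DMA, &map);

	/* 2. VFIO PCI installs it as the first-layer mapping on the RID */
	ioctl(vfio_dev_fd, VFIO_ATTACH_IOASID, &gpa_ioasid);

	/* 3. a PASID ioasid logically nested under the GPA ioasid */
	int pasid_ioasid = ioctl(fd, IOASID_ALLOC_NESTED, &gpa_ioasid);

	/* 4. VFIO authorizes that PASID for the RID, checking the HW rules */
	ioctl(vfio_dev_fd, VFIO_ALLOW_PASID, &pasid_ioasid);

	return pasid_ioasid;
}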
There is no real difference between setting up an IOMMU table for a
(RID,PASID) tuple or just a RID. We can do it universally with
one interface for all consumers.
I wanted this when we were doing VDPA for the first time, now that we
are doing pasid and more difficult stuff I view it as essential.
Jason
On Tue, Apr 06, 2021 at 12:37:35AM +0000, Tian, Kevin wrote:
> With nested translation it is GVA->GPA->HPA. The kernel needs to
> fix fault related to GPA->HPA (managed by VFIO/VDPA) while
> handle_mm_fault only handles HVA->HPA. In this case, the 2nd-level
> page fault is expected to be delivered to VFIO/VDPA first which then
> find HVA related to GPA, call handle_mm_fault to fix HVA->HPA,
> and then call iommu_map to fix GPA->HPA in the IOMMU page table.
> This is exactly like how CPU EPT violation is handled.
No, it should all be in the /dev/ioasid layer not duplicated into
every user.
> > If the fault needs to be fixed in the guest, then it needs to be
> > delivered over /dev/ioasid in some way and injected into the
> > vIOMMU. VFIO and VDPA have nothing to do with vIOMMU driver in quemu.
> >
> > You need to have an interface under /dev/ioasid to create both page
> > table levels and part of that will be to tell the kernel what VA is
> > mapped and how to handle faults.
>
> VFIO/VDPA already have their own interface to manage GPA->HPA
> mappings. Why do we want to duplicate it in /dev/ioasid?
They have their own interface to manage other types of HW, we should
not duplicate PASID programming into there too.
Jason
On Tue, Apr 06, 2021 at 01:02:05AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Tuesday, April 6, 2021 7:40 AM
> >
> > On Fri, Apr 02, 2021 at 07:58:02AM +0000, Tian, Kevin wrote:
> > > > From: Jason Gunthorpe <[email protected]>
> > > > Sent: Thursday, April 1, 2021 9:47 PM
> > > >
> > > > On Thu, Apr 01, 2021 at 01:43:36PM +0000, Liu, Yi L wrote:
> > > > > > From: Jason Gunthorpe <[email protected]>
> > > > > > Sent: Thursday, April 1, 2021 9:16 PM
> > > > > >
> > > > > > On Thu, Apr 01, 2021 at 01:10:48PM +0000, Liu, Yi L wrote:
> > > > > > > > From: Jason Gunthorpe <[email protected]>
> > > > > > > > Sent: Thursday, April 1, 2021 7:47 PM
> > > > > > > [...]
> > > > > > > > I'm worried Intel views the only use of PASID in a guest is with
> > > > > > > > ENQCMD, but that is not consistent with the industry. We need to
> > see
> > > > > > > > normal nested PASID support with assigned PCI VFs.
> > > > > > >
> > > > > > > I'm not quire flow here. Intel also allows PASID usage in guest
> > without
> > > > > > > ENQCMD. e.g. Passthru a PF to guest, and use PASID on it without
> > > > > > ENQCMD.
> > > > > >
> > > > > > Then you need all the parts, the hypervisor calls from the vIOMMU,
> > and
> > > > > > you can't really use a vPASID.
> > > > >
> > > > > This is a diagram shows the vSVA setup.
> > > >
> > > > I'm not talking only about vSVA. Generic PASID support with arbitary
> > > > mappings.
> > > >
> > > > And how do you deal with the vPASID vs pPASID issue if the system has
> > > > a mix of physical devices and mdevs?
> > > >
> > >
> > > We plan to support two schemes. One is vPASID identity-mapped to
> > > pPASID then the mixed scenario just works, with the limitation of
> > > lacking of live migration support. The other is non-identity-mapped
> > > scheme, where live migration is supported but physical devices and
> > > mdevs should not be mixed in one VM if both expose SVA capability
> > > (requires some filtering check in Qemu).
> >
> > That just becomes "block vPASID support if any device that
> > doesn't use ENQCMD is plugged into the guest"
>
> The limitation is only for physical device. and in reality it is not that
> bad. To support live migration with physical device we anyway need
> additional work to migrate the device state (e.g. based on Max's work),
> then it's not unreasonable to also mediate guest programming of
> device specific PASID register to enable vPASID (need to translate in
> the whole VM lifespan but likely is not a hot path).
IMHO that is pretty unreasonable.. More likely we end up with vPASID
tables in each migratable device like KVM has.
> > Which needs a special VFIO capability of some kind so qemu knows to
> > block it. This really needs to all be layed out together so someone
> > can understand it :(
>
> Or could simply based on whether the VFIO device supports live migration.
You need to define affirmative caps that indicate that vPASID will be
supported by the VFIO device.
> > Why doesn't the siov cookbook explaining this stuff??
> >
> > > We hope the /dev/ioasid can support both schemes, with the minimal
> > > requirement of allowing userspace to tag a vPASID to a pPASID and
> > > allowing mdev to translate vPASID into pPASID, i.e. not assuming that
> > > the guest will always use pPASID.
> >
> > What I'm a unclear of is if /dev/ioasid even needs to care about
> > vPASID or if vPASID is just a hidden artifact of the KVM connection to
> > setup the translation table and the vIOMMU driver in qemu.
>
> Not just for KVM. Also required by mdev, which needs to translate
> vPASID into pPASID when ENQCMD is not used.
Do we have any mdevs that will do this?
> should only care about the operations related to pPASID. VFIO could
> carry vPASID information to mdev.
It depends how common this is, I suppose
Jason
On 2021/4/6 8:42 PM, Jason Gunthorpe wrote:
> On Tue, Apr 06, 2021 at 09:35:17AM +0800, Jason Wang wrote:
>
>>> VFIO and VDPA has no buisness having map/unmap interfaces once we have
>>> /dev/ioasid. That all belongs in the iosaid side.
>>>
>>> I know they have those interfaces today, but that doesn't mean we have
>>> to keep using them for PASID use cases, they should be replaced with a
>>> 'do dma from this pasid on /dev/ioasid' interface certainly not a
>>> 'here is a pasid from /dev/ioasid, go ahead and configure it youself'
>>> interface
>>
>> So it looks like the PASID was bound to SVA in this design. I think it's not
>> necessairly the case:
> No, I wish people would stop talking about SVA.
>
> SVA and vSVA are a very special narrow configuration of a PASID. There
> are lots of other PASID configurations! That is the whole point, a
> PASID is complicated, there are many configuration scenarios, they
> need to be in one place with a very clearly defined uAPI
Right, that's my understanding as well.
>
>> 1) PASID can be implemented without SVA, in this case a map/unmap interface
>> is still required
> Any interface to manipulate a PASID should be under /dev/ioasid. We do
> not want to duplicate this into every subsystem.
Yes.
>
>> 2) For the case that hypervisor want to do some mediation in the middle for
>> a virtqueue. e.g in the case of control vq that is implemented in the
>> VF/ADI/SF itself, the hardware virtqueue needs to be controlled by Qemu,
>> Though binding qemu's page table to cvq can work but it looks like a
>> overkill, a small dedicated buffers that is mapped for this PASID seems more
>> suitalbe.
> /dev/ioasid should allow userspace to setup any PASID configuration it
> wants. There are many choices. That is the whole point, instead of
> copying&pasting all the PASID configuration option into every
> subsystem we have on place to configure it.
>
> If you want a PASID (or generic ioasid) that has the guest physical
> map, which is probably all that VDPA would ever want, then /dev/ioasid
> should be able to prepare that.
>
> If you just want to map a few buffers into a PASID then it should be
> able to do that too.
>
>> So do you mean the device should not expose the PASID confiugration API to
>> guest? I think it could happen if we assign the whole device and let guest
>> to configure it for nested VMs.
> This always needs co-operating with the vIOMMU driver. We can't have
> nested PASID use without both parts working together.
>
> The vIOMMU driver configures the PASID and assigns the mappings
> (however complicated that turns out to be)
>
> The VDPA/mdev driver authorizes the HW to use the ioasid mapping, eg
> by authorizing a queue to issue PCIe TLPs with a specific PASID.
>
> The authorization is triggered by the guest telling the vIOMMU to
> allow a vRID to talk to a PASID, which qemu would have to translate to
> telling something like the VDPA driver under the vRID that it can use
> a PASID from /dev/ioasid
>
> For security a VDPA/mdev device MUST NOT issue PASIDs that the vIOMMU
> has not authorized its vRID to use. Otherwise the security model of
> something like VFIO in the guest becomes completely broken.
Yes, that's how it should work.
Thanks
>
> Jason
>
> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, April 6, 2021 8:35 PM
>
> On Tue, Apr 06, 2021 at 01:27:15AM +0000, Tian, Kevin wrote:
> >
> > and here is one example why using existing VFIO/VDPA interface makes
> > sense. say dev1 (w/ sva) and dev2 (w/o sva) are placed in a single VFIO
> > container.
>
> Forget about SVA, it is an irrelevant detail of how a PASID is
> configured.
>
> > The container is associated to an iommu domain which contains a
> > single 2nd-level page table, shared by both devices (when attached
> > to the domain).
>
> This level should be described by an ioasid.
>
> > The VFIO MAP operation is applied to the 2nd-level
> > page table thus naturally applied to both devices. Then userspace
> > could use /dev/ioasid to further allocate IOASIDs and bind multiple
> > 1st-level page tables for dev1, nested on the shared 2nd-level page
> > table.
>
> Because if you don't then we enter insane world where a PASID is being
> created under /dev/ioasid but its translation path flows through setup
> done by VFIO and the whole user API becomes an incomprehensible mess.
>
> How will you even associate the PASID with the other translation??
PASID is attached to a specific iommu domain (created by VFIO/VDPA), which
has GPA->HPA mappings already configured. If we view that mapping as an
attribute of the iommu domain, it's reasonable to have the userspace-bound
pgtable, provided through /dev/ioasid, nest on it.
>
> The entire translation path for any ioasid or PASID should be defined
> only by /dev/ioasid. Everything else is a legacy API.
>
> > If following your suggestion then VFIO must deny VFIO MAP operations
> > on sva1 (assume userspace should not mix sva1 and sva2 in the same
> > container and instead use /dev/ioasid to map for sva1)?
>
> No, userspace creates an iosaid for the guest physical mapping and
> passes this ioasid to VFIO PCI which will assign it as the first layer
> mapping on the RID
Is it a dummy ioasid just for providing GPA mappings for other IOASIDs
to nest on? Then do we waste one per VM?
>
> When PASIDs are allocated the uAPI will be told to logically nested
> under the first ioasid. When VFIO authorizes a PASID for a RID it
> checks that all the HW rules are being followed.
As I explained above, why can't we just use the iommu domain to connect
the dots? Every passthrough framework needs to create an iommu domain
first, and it needs to support both devices w/ PASID and devices w/o PASID.
For devices w/o PASID it needs to invent its own MAP interface anyway.
Then why do we bother creating another MAP interface through /dev/ioasid,
which not only duplicates it but also creates a transition burden between
two sets of MAP interfaces when the guest turns on/off the pasid capability
on the device?
>
> If there are rules like groups of VFIO devices must always use the
> same IOASID then VFIO will check these too (and realistically qemu
> will have only one guest physical map ioasid anyhow)
>
> There is no real difference between setting up an IOMMU table for a
> (RID,PASID) tuple or just a RID. We can do it universally with
> one interface for all consumers.
>
'universally' depends on which angle you look at this problem from. From the
IOASID p.o.v. possibly yes, but from the device passthrough p.o.v. it's the
opposite, since the passthrough framework needs to handle devices w/o PASID
anyway (and even a device w/ PASID could send traffic w/o PASID); thus
'universally' makes more sense if the passthrough framework can use one
interface of its own to manage GPA mappings for all consumers (applying to
the case when a PASID is allowed/authorized).
Thanks
Kevin
> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, April 6, 2021 8:21 PM
>
> On Tue, Apr 06, 2021 at 01:02:05AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Tuesday, April 6, 2021 7:40 AM
> > >
> > > On Fri, Apr 02, 2021 at 07:58:02AM +0000, Tian, Kevin wrote:
> > > > > From: Jason Gunthorpe <[email protected]>
> > > > > Sent: Thursday, April 1, 2021 9:47 PM
> > > > >
> > > > > On Thu, Apr 01, 2021 at 01:43:36PM +0000, Liu, Yi L wrote:
> > > > > > > From: Jason Gunthorpe <[email protected]>
> > > > > > > Sent: Thursday, April 1, 2021 9:16 PM
> > > > > > >
> > > > > > > On Thu, Apr 01, 2021 at 01:10:48PM +0000, Liu, Yi L wrote:
> > > > > > > > > From: Jason Gunthorpe <[email protected]>
> > > > > > > > > Sent: Thursday, April 1, 2021 7:47 PM
> > > > > > > > [...]
> > > > > > > > > I'm worried Intel views the only use of PASID in a guest is with
> > > > > > > > > ENQCMD, but that is not consistent with the industry. We need
> to
> > > see
> > > > > > > > > normal nested PASID support with assigned PCI VFs.
> > > > > > > >
> > > > > > > > I'm not quire flow here. Intel also allows PASID usage in guest
> > > without
> > > > > > > > ENQCMD. e.g. Passthru a PF to guest, and use PASID on it
> without
> > > > > > > ENQCMD.
> > > > > > >
> > > > > > > Then you need all the parts, the hypervisor calls from the vIOMMU,
> > > and
> > > > > > > you can't really use a vPASID.
> > > > > >
> > > > > > This is a diagram shows the vSVA setup.
> > > > >
> > > > > I'm not talking only about vSVA. Generic PASID support with arbitary
> > > > > mappings.
> > > > >
> > > > > And how do you deal with the vPASID vs pPASID issue if the system
> has
> > > > > a mix of physical devices and mdevs?
> > > > >
> > > >
> > > > We plan to support two schemes. One is vPASID identity-mapped to
> > > > pPASID then the mixed scenario just works, with the limitation of
> > > > lacking of live migration support. The other is non-identity-mapped
> > > > scheme, where live migration is supported but physical devices and
> > > > mdevs should not be mixed in one VM if both expose SVA capability
> > > > (requires some filtering check in Qemu).
> > >
> > > That just becomes "block vPASID support if any device that
> > > doesn't use ENQCMD is plugged into the guest"
> >
> > The limitation is only for physical device. and in reality it is not that
> > bad. To support live migration with physical device we anyway need
> > additional work to migrate the device state (e.g. based on Max's work),
> > then it's not unreasonable to also mediate guest programming of
> > device specific PASID register to enable vPASID (need to translate in
> > the whole VM lifespan but likely is not a hot path).
>
> IMHO that is pretty unreasonable.. More likely we end up with vPASID
> tables in each migratable device like KVM has.
Just like mdev needs to maintain an allowed PASID list, this extends it to
all migratable devices.
>
> > > Which needs a special VFIO capability of some kind so qemu knows to
> > > block it. This really needs to all be layed out together so someone
> > > can understand it :(
> >
> > Or could simply based on whether the VFIO device supports live migration.
>
> You need to define affirmative caps that indicate that vPASID will be
> supported by the VFIO device.
Yes, this is required as I acked in another mail.
>
> > > Why doesn't the siov cookbook explaining this stuff??
> > >
> > > > We hope the /dev/ioasid can support both schemes, with the minimal
> > > > requirement of allowing userspace to tag a vPASID to a pPASID and
> > > > allowing mdev to translate vPASID into pPASID, i.e. not assuming that
> > > > the guest will always use pPASID.
> > >
> > > What I'm a unclear of is if /dev/ioasid even needs to care about
> > > vPASID or if vPASID is just a hidden artifact of the KVM connection to
> > > setup the translation table and the vIOMMU driver in qemu.
> >
> > Not just for KVM. Also required by mdev, which needs to translate
> > vPASID into pPASID when ENQCMD is not used.
>
> Do we have any mdev's that will do this?
Definitely. Actually any mdev which doesn't do ENQCMD needs to do this.
In the normal case, the PASID is programmed into an MMIO register (or
in-memory context) associated with the backend resource of the mdev. The
value programmed from the guest is a vPASID, which must therefore be
translated into a pPASID before updating the physical register.
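Purely as an illustration of this point (all names below are made up, not
code from any existing driver), the emulated PASID register write in such
an mdev would look roughly like:

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/io.h>

struct my_mdev {
	u32 vpasid[8];		/* vPASID -> pPASID table, filled from the  */
	u32 ppasid[8];		/* vPASID information carried down by VFIO  */
	unsigned int nr;
	void __iomem *pasid_reg;	/* PASID register of the backend resource */
};

static int my_vpasid_to_ppasid(struct my_mdev *m, u32 vpasid, u32 *ppasid)
{
	unsigned int i;

	for (i = 0; i < m->nr; i++) {
		if (m->vpasid[i] == vpasid) {
			*ppasid = m->ppasid[i];
			return 0;
		}
	}
	return -EINVAL;
}

/* guest wrote a vPASID into the emulated MMIO register */
static int my_mdev_pasid_reg_write(struct my_mdev *m, u32 guest_val)
{
	u32 ppasid;

	if (my_vpasid_to_ppasid(m, guest_val, &ppasid))
		return -EINVAL;

	writel(ppasid, m->pasid_reg);	/* only the pPASID reaches the HW */
	return 0;
}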
>
> > should only care about the operations related to pPASID. VFIO could
> > carry vPASID information to mdev.
>
> It depends how common this is, I suppose
>
Based on the above I think it's a common case.
Thanks
Kevin
> From: Jason Gunthorpe
> Sent: Tuesday, April 6, 2021 8:43 PM
>
> On Tue, Apr 06, 2021 at 09:35:17AM +0800, Jason Wang wrote:
>
> > > VFIO and VDPA has no buisness having map/unmap interfaces once we
> have
> > > /dev/ioasid. That all belongs in the iosaid side.
> > >
> > > I know they have those interfaces today, but that doesn't mean we have
> > > to keep using them for PASID use cases, they should be replaced with a
> > > 'do dma from this pasid on /dev/ioasid' interface certainly not a
> > > 'here is a pasid from /dev/ioasid, go ahead and configure it youself'
> > > interface
> >
> > So it looks like the PASID was bound to SVA in this design. I think it's not
> > necessairly the case:
>
> No, I wish people would stop talking about SVA.
>
> SVA and vSVA are a very special narrow configuration of a PASID. There
> are lots of other PASID configurations! That is the whole point, a
> PASID is complicated, there are many configuration scenarios, they
> need to be in one place with a very clearly defined uAPI
>
I feel it also makes sense to allow a subsystem to specify which configurations
are permitted when allowing a PASID on its device, e.g. excluding things like
GPA mappings that existing subsystems (VFIO/VDPA) already handle well:
- Share GPA mappings between multiple devices (w/ or w/o PASID) for
better IOTLB efficiency;
- Share GPA mappings between transactions w/ PASID and transactions w/o
PASID from the same device (e.g. GPU) for better IOTLB efficiency;
- Use the same page table for GPA mappings before and after the guest
turns on/off the PASID capability;
All of the above are given as long as we continue to let VFIO/VDPA manage the
iommu domain and associated GPA mappings for PASID. The IOMMU driver
already ensures that a nested PASID entry links to the established GPA paging
structure of the domain when the 1st-level pgtable is bound through
/dev/ioasid.
In contrast, the above merits are lost if we force a model where GPA mappings
for PASID must be constructed through /dev/ioasid, as this will lead to
multiple paging structures for the same GPA mappings, implying worse
IOTLB usage and an unnecessary cost of invalidations.
Therefore, I envision a scheme where the subsystem could specify the
permitted PASID configurations when doing ALLOW_PASID, and then
userspace queries the per-PASID capability to learn which operations
are allowed (a rough sketch follows the list below), e.g.:
1) To enable vSVA, VFIO/VDPA allows pgtable binding and related invalidation/
fault ops through /dev/ioasid;
2) for vDPA control vq usage, no configuration is allowed through /dev/ioasid;
3) for a new subsystem which doesn't carry any legacy or similar usage as
VFIO/VDPA, it could permit all configurations through /dev/ioasid including
1st-level binding and 2nd-level mapping ops;
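A rough sketch of what such per-PASID permission masks could look like;
none of these flag names or structures exist, they are placeholders meant
only to illustrate cases 1)-3) above:

#include <stdint.h>

#define IOASID_CAP_BIND_PGTABLE (1u << 0)	/* 1st-level pgtable bind   */
#define IOASID_CAP_INVALIDATE   (1u << 1)	/* related invalidation ops */
#define IOASID_CAP_FAULT_REPORT (1u << 2)	/* recoverable fault ops    */
#define IOASID_CAP_MAP_DMA      (1u << 3)	/* 2nd-level map/unmap ops  */

struct ioasid_allow_pasid {
	uint32_t pasid;
	uint32_t permitted;	/* IOASID_CAP_* mask set by the subsystem */
};

/* 1) vSVA via VFIO/VDPA: bind + invalidation + faults, no map/unmap   */
static const uint32_t vsva_caps = IOASID_CAP_BIND_PGTABLE |
				  IOASID_CAP_INVALIDATE |
				  IOASID_CAP_FAULT_REPORT;

/* 2) vDPA control-vq usage: nothing configurable through /dev/ioasid  */
static const uint32_t cvq_caps = 0;

/* 3) a future subsystem with no legacy MAP path: everything permitted */
static const uint32_t new_subsys_caps = IOASID_CAP_BIND_PGTABLE |
					IOASID_CAP_INVALIDATE |
					IOASID_CAP_FAULT_REPORT |
					IOASID_CAP_MAP_DMA;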
This approach also allows us to grow the uAPI in a staged manner. For now,
focus on 1) and 2), as VFIO/VDPA are the only two users and have a good
legacy path to cover the GPA mappings. More ops can be introduced for 3) when
there is a real example showing exactly which ops are required for such a new
subsystem.
Is this a good strategy to move forward?
btw this discussion was raised when discussing the I/O page fault handling
process. Currently the IOMMU layer implements a per-device fault reporting
mechanism, which requires VFIO to register a handler to receive all faults
on its device and then forward them to ioasid if they are due to the 1st level.
Possibly it makes more sense to convert it into a per-pgtable reporting scheme,
where the owner of each pgtable registers its own handler. It means
for 1) VFIO will register a 2nd-level pgtable handler while /dev/ioasid
will register a 1st-level pgtable handler, and for 3) /dev/ioasid will register
handlers for both the 1st-level and 2nd-level pgtables. Jean, I'd also like to
know your thoughts...
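To give a feel for the shape of such a per-pgtable interface, here is a
minimal sketch; the structure, handler type and registration function below
are invented for illustration and are not an existing IOMMU API:

#include <linux/types.h>

struct iommu_pgtable_fault {
	u32 pasid;		/* 0 if the fault carries no PASID           */
	u64 addr;
	u32 perm;		/* requested permissions                     */
	u32 device_id;		/* still needed to route the response back   */
};

typedef int (*pgtable_fault_handler_t)(struct iommu_pgtable_fault *fault,
				       void *data);

/* the owner of a given page table registers for faults on that level only */
int iommu_register_pgtable_fault_handler(void *pgtable_cookie,
					 pgtable_fault_handler_t handler,
					 void *data);

/*
 * Case 1): VFIO registers for the 2nd-level pgtable it owns, while
 * /dev/ioasid registers for each guest-bound 1st-level pgtable.
 * Case 3): /dev/ioasid registers for both levels.
 */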
Thanks
Kevin
On Wed, Apr 07, 2021 at 08:17:50AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe
> > Sent: Tuesday, April 6, 2021 8:43 PM
> >
> > On Tue, Apr 06, 2021 at 09:35:17AM +0800, Jason Wang wrote:
> >
> > > > VFIO and VDPA has no buisness having map/unmap interfaces once we
> > have
> > > > /dev/ioasid. That all belongs in the iosaid side.
> > > >
> > > > I know they have those interfaces today, but that doesn't mean we have
> > > > to keep using them for PASID use cases, they should be replaced with a
> > > > 'do dma from this pasid on /dev/ioasid' interface certainly not a
> > > > 'here is a pasid from /dev/ioasid, go ahead and configure it youself'
> > > > interface
> > >
> > > So it looks like the PASID was bound to SVA in this design. I think it's not
> > > necessairly the case:
> >
> > No, I wish people would stop talking about SVA.
> >
> > SVA and vSVA are a very special narrow configuration of a PASID. There
> > are lots of other PASID configurations! That is the whole point, a
> > PASID is complicated, there are many configuration scenarios, they
> > need to be in one place with a very clearly defined uAPI
> >
>
> I feel it also makes sense to allow a subsystem to specify which configurations
> are permitted when allowing a PASID on its device
huh? why?
> e.g. excluding things like
> GPA mappings that existing subsystems (VFIO/VDPA) already handle well:
They don't "handle well", they have some historical baggage that is no
longer suitable for the complexity this area has in the modern world.
Forget about the existing APIs and replace them in /dev/ioasid.
> - Share GPA mappings between multiple devices (w/ or w/o PASID) for
> better IOTLB efficiency;
>
> - Share GPA mappings between transactions w/ PASID and transactions w/o
> PASID from the same device (e.g. GPU) for better IOTLB efficiency;
>
> - Use the same page table for GPA mappings before and after the guest
> turns on/off the PASID capability;
All of these are cases you need to design the /dev/ioasid to handle.
It is pretty clear to me that you'll need non-PASID IOASIDs as
well.
Ideally a generic IOASID would just be a page table, and it doesn't
crystallize into a RID or RID,PASID routing until devices are attached
to it.
Since IOASID can be nested the only thing that makes any sense is for
each level of the nest to be visible under /dev/ioasid.
What a complete mess it would be if vfio-pci owns the GPA table,
/dev/ioasid has a nested PASID, and vfio-mdev is running an mdev on top
of that PASID.
> All above are given as long as we continue to let VFIO/VDPA manage the
> iommu domain and associated GPA mappings for PASID.
So don't do that. Don't I keep saying this weird split is making a
horrible mess?
You can't reasonably build the complex PASID scenarios you talk about
above unless the entire translation path is owned by one entity:
/dev/ioasid.
You need to focus on figuring out what that looks like then figure out
how to move VDPA and VFIO to consume /dev/ioasid for all of their
translation instead of open-coding half-baked internal versions.
Jason
On Wed, Apr 07, 2021 at 02:08:33AM +0000, Tian, Kevin wrote:
> > Because if you don't then we enter insane world where a PASID is being
> > created under /dev/ioasid but its translation path flows through setup
> > done by VFIO and the whole user API becomes an incomprehensible mess.
> >
> > How will you even associate the PASID with the other translation??
>
> PASID is attached to a specific iommu domain (created by VFIO/VDPA), which
> has GPA->HPA mappings already configured. If we view that mapping as an
> attribute of the iommu domain, it's reasonable to have the userspace-bound
> pgtable through /dev/ioasid to nest on it.
A user-controlled page table should absolutely not be an attribute of
a hidden kernel object, nor should two parts of the kernel silently
connect to each other via hidden internal objects like this.
Security is important - this kind of connection must use some explicit
FD authorization to access shared objects, not be made implicit!
IMHO this direction is a dead end for this reason.
> > The entire translation path for any ioasid or PASID should be defined
> > only by /dev/ioasid. Everything else is a legacy API.
> >
> > > If following your suggestion then VFIO must deny VFIO MAP operations
> > > on sva1 (assume userspace should not mix sva1 and sva2 in the same
> > > container and instead use /dev/ioasid to map for sva1)?
> >
> > No, userspace creates an iosaid for the guest physical mapping and
> > passes this ioasid to VFIO PCI which will assign it as the first layer
> > mapping on the RID
>
> Is it an dummy ioasid just for providing GPA mappings for nesting purpose
> of other IOASIDs? Then we waste one per VM?
Generic ioasids are "free"; they are just software constructs in the
kernel.
> > When PASIDs are allocated the uAPI will be told to logically nested
> > under the first ioasid. When VFIO authorizes a PASID for a RID it
> > checks that all the HW rules are being followed.
>
> As I explained above, why cannot we just use iommu domain to connect
> the dots?
Security.
> Every passthrough framework needs to create an iommu domain
> first. and It needs to support both devices w/ PASID and devices w/o
> PASID. For devices w/o PASID it needs to invent its own MAP
> interface anyway.
No, it should consume an ioasid from /dev/ioasid, use a common ioasid
map interface and assign that ioasid to a RID.
Don't get so fixated on PASID as a special case
> Then why do we bother creating another MAP interface through
> /dev/ioasid which not only duplicates but also creating transition
> burden between two set of MAP interfaces when the guest turns on/off
> the pasid capability on the device?
Don't transition. Always use the new interface. qemu detects the
kernel supports /dev/ioasid and *all iommu page table configuration*
goes through there. VFIO and VDPA APIs become unused for iommu
configuration.
> 'universally' upon from which angle you look at this problem. From IOASID
> p.o.v possibly yes, but from device passthrough p.o.v. it's the opposite
> since the passthrough framework needs to handle devices w/o PASID anyway
> (or even for device w/ PASID it could send traffic w/o PASID) thus 'universally'
> makes more sense if the passthrough framework can use one interface of its
> own to manage GPA mappings for all consumers (apply to the case when a
> PASID is allowed/authorized).
You correctly named it /dev/ioasid: it is a generic way to allocate,
manage and assign IOMMU page tables, of which, when generalized, only
some may consume a limited PASID.
RID and RID,PASID are the same thing, just a small difference in how
they match TLPs.
Jason
On Wed, Apr 07, 2021 at 08:17:50AM +0000, Tian, Kevin wrote:
> btw this discussion was raised when discussing the I/O page fault handling
> process. Currently the IOMMU layer implements a per-device fault reporting
> mechanism, which requires VFIO to register a handler to receive all faults
> on its device and then forwards to ioasid if it's due to 1st-level. Possibly it
> makes more sense to convert it into a per-pgtable reporting scheme, and
> then the owner of each pgtable should register its own handler.
Maybe, but you do need device information in there, since that's how the
fault is reported to the guest and how the response is routed back to the
faulting device (only PASID+PRGI would cause aliasing). And we need to
report non-recoverable faults, as well as recoverable ones without PASID,
once we hand control of level-1 page tables to guests.
> It means
> for 1) VFIO will register a 2nd-level pgtable handler while /dev/ioasid
> will register a 1st-level pgtable handler, while for 3) /dev/ioasid will register
> handlers for both 1st-level and 2nd-level pgtable. Jean? also want to know
> your thoughts...
Moving all IOMMU controls to /dev/ioasid rather that splitting them is
probably better. Hopefully the implementation can reuse most of
vfio_iommu_type1.
I'm trying to sketch what may work for Arm, if we have to reuse
/dev/ioasid to avoid duplication of fault and inval queues:
* Get a container handle out of /dev/ioasid (or /dev/iommu, really.)
No operation available since we don't know what the device and IOMMU
capabilities are.
* Attach the handle to a VF. With VFIO that would be
VFIO_GROUP_SET_CONTAINER. That causes the kernel to associate an IOMMU
with the handle, and decide which operations are available.
* With a map/unmap vIOMMU (or shadow mappings), a single translation level
is supported. With a nesting vIOMMU, we're populating the level-2
translation (some day maybe by binding the KVM page tables, but
currently with map/unmap ioctl).
Single-level translation needs a single VF per container. Two levels would
allow sharing stage-2 between multiple VFs, though it's a pain to define
and implement.
* Without a vIOMMU or if the vIOMMU starts in bypass, populate the
container page tables.
Start the guest.
* With a map/unmap vIOMMU, guest creates mappings, userspace populates the
page tables with map/unmap ioctl.
It would be possible to add a PASID mode there: guest requests an
address space with a specific PASID, userspace derives an IOASID handle
from the container handle and populate that address space with map/unmap
ioctl. That would enable PASID on sub-VF assignment, which requires the
host to control which PASID is programmed into the VF (with
DEVICE_ALLOW_IOASID, I guess). And either the host allocates the PASID
in this case (which isn't supported by a vSMMU) or we have to do a
vPASID -> pPASID. I don't know if it's worth the effort.
Or
* With a nesting vIOMMU, the guest attaches a PASID table to a VF,
userspace issues a SET_PASID_TABLE ioctl on the container handle. If
we support multiple VFs per container, we first need to derive a child
container from the main one and the device, then attach the PASID table.
Guest programs the PASID table, sends invalidations when removing
mappings which are relayed to the host on the child container. Page
faults and response queue would be per container, so if multiple VF per
container, we could have one queue for the parent (level-2 faults) and
one for each child (level-1 faults).
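Translated into a purely illustrative ioctl sequence for the nesting case:
VFIO_GROUP_SET_CONTAINER is the existing VFIO ioctl, but everything on the
/dev/ioasid side (names, request codes, struct layouts) is hypothetical:

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

#define IOASID_DERIVE_CHILD     0x10	/* placeholder request codes */
#define IOASID_SET_PASID_TABLE  0x11
#define IOASID_CACHE_INVALIDATE 0x12

static void arm_nesting_flow(int group_fd, uint64_t pasid_table_gpa)
{
	/* container handle out of /dev/ioasid (or /dev/iommu, really) */
	int container = open("/dev/ioasid", O_RDWR);

	/* attach the handle to the VF; the kernel then knows the IOMMU
	 * and decides which operations are available */
	ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container);

	/* multiple VFs per container: derive a per-device child first */
	int child = ioctl(container, IOASID_DERIVE_CHILD, 0);

	/* the guest attached a PASID table to the VF: install it */
	ioctl(child, IOASID_SET_PASID_TABLE, &pasid_table_gpa);

	/* guest invalidations for level-1 mappings are relayed here */
	ioctl(child, IOASID_CACHE_INVALIDATE, 0);

	/* level-2 faults: parent container queue; level-1: child queue */
}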
Thanks,
Jean
On Wed, Apr 07, 2021 at 08:43:50PM +0200, Jean-Philippe Brucker wrote:
> * Get a container handle out of /dev/ioasid (or /dev/iommu, really.)
> No operation available since we don't know what the device and IOMMU
> capabilities are.
>
> * Attach the handle to a VF. With VFIO that would be
> VFIO_GROUP_SET_CONTAINER. That causes the kernel to associate an IOMMU
> with the handle, and decide which operations are available.
Right, this is basically the point - the VFIO container (/dev/vfio)
and the /dev/ioasid we are talking about have a core of
similarity. ioasid is the generalized, modernized, and cross-subsystem
version of the same idea. Instead of calling it "vfio container" we
call it something that evokes the idea of controlling the iommu.
The issue is to separate /dev/vfio generic functionality from vfio and
share it with every subsystem.
It may be that /dev/vfio and /dev/ioasid end up sharing a lot of code,
with a different IOCTL interface around it. The vfio_iommu_driver_ops
is not particularly VFIOy.
Creating /dev/ioasid may primarily start as a code reorganization
exercise.
> * With a map/unmap vIOMMU (or shadow mappings), a single translation level
> is supported. With a nesting vIOMMU, we're populating the level-2
> translation (some day maybe by binding the KVM page tables, but
> currently with map/unmap ioctl).
>
> Single-level translation needs single VF per container.
Really? Why?
Jason
> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, April 7, 2021 8:21 PM
>
> On Wed, Apr 07, 2021 at 02:08:33AM +0000, Tian, Kevin wrote:
>
> > > Because if you don't then we enter insane world where a PASID is being
> > > created under /dev/ioasid but its translation path flows through setup
> > > done by VFIO and the whole user API becomes an incomprehensible
> mess.
> > >
> > > How will you even associate the PASID with the other translation??
> >
> > PASID is attached to a specific iommu domain (created by VFIO/VDPA),
> which
> > has GPA->HPA mappings already configured. If we view that mapping as an
> > attribute of the iommu domain, it's reasonable to have the userspace-
> bound
> > pgtable through /dev/ioasid to nest on it.
>
> A user controlled page table should absolutely not be an attribute of
> a hidden kernel object, nor should two parts of the kernel silently
> connect to each other via a hidden internal objects like this.
>
> Security is important - the kind of connection must use some explicit
> FD authorization to access shared objects, not be made implicit!
>
> IMHO this direction is a dead end for this reason.
>
Could you elaborate on what exact security problem is introduced by this
approach? Isn't ALLOW_PASID the authorization interface for the
connection?
Based on all your replies, I now see that what you actually want is to
generalize all IOMMU-related stuff through /dev/ioasid (sort of a /dev/iommu),
which requires factoring out vfio_iommu_type1 into the general part. This is
a huge amount of work.
Is it really the practice in Linux that any new feature has to be
blocked as long as a refactoring work is identified? Don't people accept
any balance between enabling new features and completing refactoring
work through a staging approach, as long as we don't introduce an uAPI
specifically for the staging purpose? ☹
Thanks
Kevin
On Wed, Apr 07, 2021 at 04:36:54PM -0300, Jason Gunthorpe wrote:
> On Wed, Apr 07, 2021 at 08:43:50PM +0200, Jean-Philippe Brucker wrote:
>
> > * Get a container handle out of /dev/ioasid (or /dev/iommu, really.)
> > No operation available since we don't know what the device and IOMMU
> > capabilities are.
> >
> > * Attach the handle to a VF. With VFIO that would be
> > VFIO_GROUP_SET_CONTAINER. That causes the kernel to associate an IOMMU
> > with the handle, and decide which operations are available.
>
> Right, this is basically the point, - the VFIO container (/dev/vfio)
> and the /dev/ioasid we are talking about have a core of
> similarity. ioasid is the generalized, modernized, and cross-subsystem
> version of the same idea. Instead of calling it "vfio container" we
> call it something that evokes the idea of controlling the iommu.
>
> The issue is to seperate /dev/vfio generic functionality from vfio and
> share it with every subsystem.
>
> It may be that /dev/vfio and /dev/ioasid end up sharing a lot of code,
> with a different IOCTL interface around it. The vfio_iommu_driver_ops
> is not particularly VFIOy.
>
> Creating /dev/ioasid may primarily start as a code reorganization
> exercise.
>
> > * With a map/unmap vIOMMU (or shadow mappings), a single translation level
> > is supported. With a nesting vIOMMU, we're populating the level-2
> > translation (some day maybe by binding the KVM page tables, but
> > currently with map/unmap ioctl).
> >
> > Single-level translation needs single VF per container.
>
> Really? Why?
The vIOMMU is started in bypass, so the device can do DMA to the GPA space
until the guest configures the vIOMMU, at which point each VF is either
kept in bypass or gets new DMA mappings, which requires the host to tear
down the bypass mappings and set up the guest mappings on a per-VF basis
(I'm not considering nesting translation in the host kernel for this,
because it's not supported by all pIOMMUs and is expensive in terms of TLB
and pinned memory). So keeping a single VF per container is simpler, but
there are certainly other programming models possible.
Thanks,
Jean
On Wed, Apr 07, 2021 at 11:50:02PM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Wednesday, April 7, 2021 8:21 PM
> >
> > On Wed, Apr 07, 2021 at 02:08:33AM +0000, Tian, Kevin wrote:
> >
> > > > Because if you don't then we enter insane world where a PASID is being
> > > > created under /dev/ioasid but its translation path flows through setup
> > > > done by VFIO and the whole user API becomes an incomprehensible
> > mess.
> > > >
> > > > How will you even associate the PASID with the other translation??
> > >
> > > PASID is attached to a specific iommu domain (created by VFIO/VDPA),
> > which
> > > has GPA->HPA mappings already configured. If we view that mapping as an
> > > attribute of the iommu domain, it's reasonable to have the userspace-
> > bound
> > > pgtable through /dev/ioasid to nest on it.
> >
> > A user controlled page table should absolutely not be an attribute of
> > a hidden kernel object, nor should two parts of the kernel silently
> > connect to each other via a hidden internal objects like this.
> >
> > Security is important - the kind of connection must use some explicit
> > FD authorization to access shared objects, not be made implicit!
> >
> > IMHO this direction is a dead end for this reason.
> >
>
> Could you elaborate what exact security problem is brought with this
> approach? Isn't ALLOW_PASID the authorization interface for the
> connection?
If the kernel objects don't come out of FDs then no.
> Is it really the only practice in Linux that any new feature has to be
> blocked as long as a refactoring work is identified?
The practice is to define uAPIs that make sense and have a good chance
of being supported over a long time period as the software evolves, not
to hack together a giant uAPI mess just to get some feature out the
door.
This proposal as it was originally shown is exactly the kind of hacky
uapi nobody wants to see. Tunneling an IOMMU uapi through a
whole bunch of other FDs is completely nuts.
Intel should basically be investing most of its time building a robust
and well designed uAPI here, and not complaining that the community is
not doing Intel's job for free.
> Don't people accept any balance between enabling new features and
> completing refactoring work through a staging approach, as long as
> we don't introduce an uAPI specifically for the staging purpose? ☹
Since this is all uapi I don't see it as applicable here.
Jason
Hi Jason,
On 4/1/21 6:03 PM, Jason Gunthorpe wrote:
> On Thu, Apr 01, 2021 at 02:08:17PM +0000, Liu, Yi L wrote:
>
>> DMA page faults are delivered to root-complex via page request message and
>> it is per-device according to PCIe spec. Page request handling flow is:
>>
>> 1) iommu driver receives a page request from device
>> 2) iommu driver parses the page request message. Get the RID,PASID, faulted
>> page and requested permissions etc.
>> 3) iommu driver triggers fault handler registered by device driver with
>> iommu_report_device_fault()
>
> This seems confused.
>
> The PASID should define how to handle the page fault, not the driver.
In my series I don't use PASID at all. I am just enabling nested stage
and the guest uses a single context. I don't allocate any user PASID at
any point.
When there is a fault at the physical level (a stage 1 fault that concerns
the guest), the latter needs to be reported and injected into the
guest. The vfio-pci driver registers a fault handler with the iommu layer,
and in that fault handler it fills a circular buffer and triggers an eventfd
that is listened to by the VFIO-PCI QEMU device. The latter retrieves
the fault from the mmapped circular buffer, knows which vIOMMU it is
attached to, and passes the fault to the vIOMMU.
Then the vIOMMU triggers an IRQ in the guest.
We are reusing the existing concepts from VFIO, region, IRQ to do that.
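A minimal kernel-side sketch of that flow, assuming the ring layout and
names below (they are invented for illustration; only struct iommu_fault,
the fault handler shape and eventfd_signal() reflect existing interfaces):

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/eventfd.h>
#include <linux/iommu.h>

struct my_fault_ring {
	struct iommu_fault *entries;	/* mmapped by userspace (QEMU)     */
	u32 head, tail, size;
	struct eventfd_ctx *trigger;	/* signalled to wake up QEMU       */
};

/* handler that vfio-pci registers with the iommu layer */
static int my_vfio_dma_fault_handler(struct iommu_fault *fault, void *data)
{
	struct my_fault_ring *ring = data;
	u32 next = (ring->head + 1) % ring->size;

	if (next == ring->tail)
		return -ENOSPC;		/* ring full, fault is dropped     */

	ring->entries[ring->head] = *fault;
	ring->head = next;

	eventfd_signal(ring->trigger, 1);	/* QEMU reads the ring and
						 * forwards to the vIOMMU  */
	return 0;
}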
For that use case, would you also use /dev/ioasid?
Thanks
Eric
>
> I don't remember any device specific actions in ATS, so what is the
> driver supposed to do?
>
>> 4) device driver's fault handler signals an event FD to notify userspace to
>> fetch the information about the page fault. If it's VM case, inject the
>> page fault to VM and let guest to solve it.
>
> If the PASID is set to 'report page fault to userspace' then some
> event should come out of /dev/ioasid, or be reported to a linked
> eventfd, or whatever.
>
> If the PASID is set to 'SVM' then the fault should be passed to
> handle_mm_fault
>
> And so on.
>
> Userspace chooses what happens based on how they configure the PASID
> through /dev/ioasid.
>
> Why would a device driver get involved here?
>
>> Eric has sent below series for the page fault reporting for VM with passthru
>> device.
>> https://lore.kernel.org/kvm/[email protected]/
>
> It certainly should not be in vfio pci. Everything using a PASID needs
> this infrastructure, VDPA, mdev, PCI, CXL, etc.
>
> Jason
>
On Thu, Apr 15, 2021 at 03:11:19PM +0200, Auger Eric wrote:
> Hi Jason,
>
> On 4/1/21 6:03 PM, Jason Gunthorpe wrote:
> > On Thu, Apr 01, 2021 at 02:08:17PM +0000, Liu, Yi L wrote:
> >
> >> DMA page faults are delivered to root-complex via page request message and
> >> it is per-device according to PCIe spec. Page request handling flow is:
> >>
> >> 1) iommu driver receives a page request from device
> >> 2) iommu driver parses the page request message. Get the RID,PASID, faulted
> >> page and requested permissions etc.
> >> 3) iommu driver triggers fault handler registered by device driver with
> >> iommu_report_device_fault()
> >
> > This seems confused.
> >
> > The PASID should define how to handle the page fault, not the driver.
>
> In my series I don't use PASID at all. I am just enabling nested stage
> and the guest uses a single context. I don't allocate any user PASID at
> any point.
>
> When there is a fault at physical level (a stage 1 fault that concerns
> the guest), this latter needs to be reported and injected into the
> guest. The vfio pci driver registers a fault handler to the iommu layer
> and in that fault handler it fills a circ bugger and triggers an eventfd
> that is listened to by the VFIO-PCI QEMU device. this latter retrives
> the faault from the mmapped circ buffer, it knowns which vIOMMU it is
> attached to, and passes the fault to the vIOMMU.
> Then the vIOMMU triggers and IRQ in the guest.
>
> We are reusing the existing concepts from VFIO, region, IRQ to do that.
>
> For that use case, would you also use /dev/ioasid?
/dev/ioasid could do all the things you described vfio-pci as doing,
it can even do them the same way you just described.
Stated another way, do you plan to duplicate all of this code someday
for vfio-cxl? What about for vfio-platform? ARM SMMU can be hooked to
platform devices, right?
I feel what you guys are struggling with is some choice in the iommu
kernel APIs that cause the events to be delivered to the pci_device
owner, not the PASID owner.
That feels solvable.
Jason
Hi Jason,
On Thu, 15 Apr 2021 20:07:32 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Thu, Apr 15, 2021 at 03:11:19PM +0200, Auger Eric wrote:
> > Hi Jason,
> >
> > On 4/1/21 6:03 PM, Jason Gunthorpe wrote:
> > > On Thu, Apr 01, 2021 at 02:08:17PM +0000, Liu, Yi L wrote:
> > >
> > >> DMA page faults are delivered to root-complex via page request
> > >> message and it is per-device according to PCIe spec. Page request
> > >> handling flow is:
> > >>
> > >> 1) iommu driver receives a page request from device
> > >> 2) iommu driver parses the page request message. Get the RID,PASID,
> > >> faulted page and requested permissions etc.
> > >> 3) iommu driver triggers fault handler registered by device driver
> > >> with iommu_report_device_fault()
> > >
> > > This seems confused.
> > >
> > > The PASID should define how to handle the page fault, not the driver.
> > >
> >
> > In my series I don't use PASID at all. I am just enabling nested stage
> > and the guest uses a single context. I don't allocate any user PASID at
> > any point.
> >
> > When there is a fault at physical level (a stage 1 fault that concerns
> > the guest), this latter needs to be reported and injected into the
> > guest. The vfio pci driver registers a fault handler to the iommu layer
> > and in that fault handler it fills a circ bugger and triggers an eventfd
> > that is listened to by the VFIO-PCI QEMU device. this latter retrives
> > the faault from the mmapped circ buffer, it knowns which vIOMMU it is
> > attached to, and passes the fault to the vIOMMU.
> > Then the vIOMMU triggers and IRQ in the guest.
> >
> > We are reusing the existing concepts from VFIO, region, IRQ to do that.
> >
> > For that use case, would you also use /dev/ioasid?
>
> /dev/ioasid could do all the things you described vfio-pci as doing,
> it can even do them the same way you just described.
>
> Stated another way, do you plan to duplicate all of this code someday
> for vfio-cxl? What about for vfio-platform? ARM SMMU can be hooked to
> platform devices, right?
>
> I feel what you guys are struggling with is some choice in the iommu
> kernel APIs that cause the events to be delivered to the pci_device
> owner, not the PASID owner.
>
> That feels solvable.
>
Perhaps more of a philosophical question for you and Alex. There is no
doubt that the direction you are steering /dev/ioasid toward is a much cleaner
one, especially after VDPA emerged as another IOMMU-backed framework.
The question is what do we do with the nested translation features that have
been targeting the existing VFIO-IOMMU for the last three years? That
predates VDPA. Shall we put a stop marker *after* nested support and say no
more extensions for VFIO-IOMMU, new features must be built on this new
interface?
If we were to close a checkout line for some unforeseen reasons, should we
honor the customers already in line for a long time?
This is not a tactic or an excuse for not working on the new /dev/ioasid
interface. In fact, I believe we can benefit from the lessons learned while
completing the existing one. That will give confidence to the new
interface. Thoughts?
> Jason
Thanks,
Jacob
On Fri, Apr 16, 2021 at 03:38:02PM +0200, Auger Eric wrote:
> The redesign requirement came pretty late in the development process.
> The iommu user API is upstream for a while, the VFIO interfaces have
> been submitted a long time ago and under review for a bunch of time.
> Redesigning everything with a different API, undefined at this point, is
> a major setback for our work and will have a large impact on the
> introduction of features companies are looking forward, hence our
> frustration.
I will answer both you and Jacob at once.
This is uAPI; once it is set it can never be changed.
The kernel process and philosophy is to invest heavily in uAPI
development and review to converge on the best uAPI possible.
Many past submissions have taken a long time to get this right; there
are several high profile uAPI examples.
Do you think this case is so special, or the concerns so minor, that it
should get to bypass all of the normal process?
Ask yourself, is anyone advocating for the current direction on
technical merits alone?
Certainly the patches I last saw were completely disgusting from a
uAPI design perspective.
It was against the development process to organize this work the way
it was done. Merging a whack of dead code into the kernel to support a
uAPI vision that was never clearly articulated was a big mistake.
Start from the beginning. Invest heavily in defining a high quality
uAPI. Clearly describe the uAPI to all stakeholders. Break up the
implementation into patch series without dead code. Make the
patches. Remove the dead code this group has already added.
None of this should be a surprise. The VDPA discussion and related
"what is a mdev" over a year ago made it pretty clear VFIO is not the
exclusive user of "IOMMU in userspace" and that places limits on what
kind of uAPI expansion it should experience going forward.
Jason
On Fri, Apr 16, 2021 at 04:26:19PM +0200, Auger Eric wrote:
> This was largely done during several confs including plumber, KVM forum,
> for several years. Also API docs were shared on the ML. I don't remember
> any voice was raised at those moments.
I don't think anyone objects to the high level ideas, but
implementation does matter. I don't think anyone presented "hey we
will tunnel an uAPI through VFIO to the IOMMU subsystem" - did they?
Look at the fairly simple IMS situation, for example. This was
presented at plumbers too, and the slides were great - but the
implementation was too hacky. It required a major rework of the x86
interrupt handling before it was OK.
Jason
Hi Jason,
On 4/16/21 4:34 PM, Jason Gunthorpe wrote:
> On Fri, Apr 16, 2021 at 04:26:19PM +0200, Auger Eric wrote:
>
>> This was largely done during several confs including plumber, KVM forum,
>> for several years. Also API docs were shared on the ML. I don't remember
>> any voice was raised at those moments.
>
> I don't think anyone objects to the high level ideas, but
> implementation does matter. I don't think anyone presented "hey we
> will tunnel an uAPI through VFIO to the IOMMU subsystem" - did they?
At minimum
https://events19.linuxfoundation.cn/wp-content/uploads/2017/11/Shared-Virtual-Memory-in-KVM_Yi-Liu.pdf
But most obviously everything is documented in
Documentation/userspace-api/iommu.rst where the VFIO tunneling is
clearly stated ;-)
But well let's work together to design a better and more elegant
solution then.
Thanks
Eric
>
> Look at the fairly simple IMS situation, for example. This was
> presented at plumbers too, and the slides were great - but the
> implementation was too hacky. It required a major rework of the x86
> interrupt handling before it was OK.
>
> Jason
>
Hi Jason,
On 4/16/21 1:07 AM, Jason Gunthorpe wrote:
> On Thu, Apr 15, 2021 at 03:11:19PM +0200, Auger Eric wrote:
>> Hi Jason,
>>
>> On 4/1/21 6:03 PM, Jason Gunthorpe wrote:
>>> On Thu, Apr 01, 2021 at 02:08:17PM +0000, Liu, Yi L wrote:
>>>
>>>> DMA page faults are delivered to root-complex via page request message and
>>>> it is per-device according to PCIe spec. Page request handling flow is:
>>>>
>>>> 1) iommu driver receives a page request from device
>>>> 2) iommu driver parses the page request message. Get the RID,PASID, faulted
>>>> page and requested permissions etc.
>>>> 3) iommu driver triggers fault handler registered by device driver with
>>>> iommu_report_device_fault()
>>>
>>> This seems confused.
>>>
>>> The PASID should define how to handle the page fault, not the driver.
>>
>> In my series I don't use PASID at all. I am just enabling nested stage
>> and the guest uses a single context. I don't allocate any user PASID at
>> any point.
>>
>> When there is a fault at physical level (a stage 1 fault that concerns
>> the guest), the latter needs to be reported and injected into the
>> guest. The vfio pci driver registers a fault handler to the iommu layer
>> and in that fault handler it fills a circ buffer and triggers an eventfd
>> that is listened to by the VFIO-PCI QEMU device. The latter retrieves
>> the fault from the mmapped circ buffer, it knows which vIOMMU it is
>> attached to, and passes the fault to the vIOMMU.
>> Then the vIOMMU triggers an IRQ in the guest.
>>
>> We are reusing the existing concepts from VFIO, region, IRQ to do that.
>>
>> For that use case, would you also use /dev/ioasid?
>
> /dev/ioasid could do all the things you described vfio-pci as doing,
> it can even do them the same way you just described.
>
> Stated another way, do you plan to duplicate all of this code someday
> for vfio-cxl? What about for vfio-platform? ARM SMMU can be hooked to
> platform devices, right?
vfio regions and IRQ related APIs are common user interfaces exposed by
all vfio drivers, including platform. The actual circular buffer
implementation details can then be put in a common lib.
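Roughly, the QEMU side of that flow would look like the sketch below; the
record layout and ring header here are just invented placeholders, the real
layout is whatever the fault region uAPI ends up defining:

  #include <stdint.h>
  #include <stdio.h>
  #include <unistd.h>

  /* Hypothetical fault record and ring header, not the actual uAPI. */
  struct fault_record {
          uint64_t addr;
          uint32_t perm;
          uint32_t pasid;
  };

  struct fault_ring {
          uint32_t prod;                  /* producer index, written by the kernel */
          uint32_t cons;                  /* consumer index, written by userspace  */
          struct fault_record recs[256];
  };

  /* ring was mmap()ed from the device's fault region, efd is the eventfd
   * registered for the fault "IRQ". */
  static void drain_faults(struct fault_ring *ring, int efd)
  {
          uint64_t cnt;

          /* Block until the kernel signals that new faults were queued. */
          if (read(efd, &cnt, sizeof(cnt)) != sizeof(cnt))
                  return;

          while (ring->cons != ring->prod) {
                  struct fault_record *r = &ring->recs[ring->cons % 256];

                  /* Forward to the vIOMMU emulation here. */
                  printf("fault: iova=0x%llx pasid=%u\n",
                         (unsigned long long)r->addr, r->pasid);
                  ring->cons++;
          }
  }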
As for the thin vfio iommu wrappers, the ones you don't like, they are
implemented in type1 code.
Maybe the need for /dev/ioasid is more pressing for PASID management, but
for the nested use case that's not obvious to me, and in your different
replies it was not crystal clear where the use case belongs.
The redesign requirement came pretty late in the development process.
The iommu user API has been upstream for a while, and the VFIO interfaces
were submitted a long time ago and have been under review for quite some
time. Redesigning everything with a different API, undefined at this
point, is a major setback for our work and will have a large impact on
the introduction of features companies are looking forward to, hence our
frustration.
Thanks
Eric
>
> I feel what you guys are struggling with is some choice in the iommu
> kernel APIs that cause the events to be delivered to the pci_device
> owner, not the PASID owner.
>
> That feels solvable.
>
> Jason
>
Hi,
On 4/16/21 4:05 PM, Jason Gunthorpe wrote:
> On Fri, Apr 16, 2021 at 03:38:02PM +0200, Auger Eric wrote:
>
>> The redesign requirement came pretty late in the development process.
>> The iommu user API has been upstream for a while, and the VFIO interfaces
>> were submitted a long time ago and have been under review for quite some
>> time. Redesigning everything with a different API, undefined at this
>> point, is a major setback for our work and will have a large impact on
>> the introduction of features companies are looking forward to, hence our
>> frustration.
>
> I will answer both you and Jacob at once.
>
> This is uAPI, once it is set it can never be changed.
>
> The kernel process and philosophy is to invest heavily in uAPI
> development and review to converge on the best uAPI possible.
>
> Many past submissions have take a long time to get this right, there
> are several high profile uAPI examples.
>
> Do you think this case is so special, or the concerns so minor, that it
> should get to bypass all of the normal process?
It's not my intent to bypass any process. I am just trying to
understand what needs to be re-designed and for what use case.
>
> Ask yourself, is anyone advocating for the current direction on
> technical merits alone?
>
> Certainly the patches I last saw where completely disgusting from a
> uAPI design perspective.
>
> It was against the development process to organize this work the way
> it was done. Merging a wack of dead code to the kernel to support a
> uAPI vision that was never clearly articulated was a big mistake.
>
> Start from the beginning. Invest heavily in defining a high quality
> uAPI. Clearly describe the uAPI to all stake holders.
This was largely done during several confs, including Plumbers and KVM Forum,
over several years. API docs were also shared on the ML. I don't remember
any objections being raised at the time.
> Break up the
> implementation into patch series without dead code. Make the
> patches. Remove the dead code this group has already added.
>
> None of this should be a surprise. The VDPA discussion and related
> "what is a mdev" over a year ago made it pretty clear VFIO is not the
> exclusive user of "IOMMU in userspace" and that places limits on what
> kind of uAPIs expansion it should experience going forward.
Maybe clear to you, but most probably not for many other stakeholders.
Anyway I do not intend to argue further, and I will be happy to learn
from you and work with you, Jacob, Liu and all other stakeholders to
define a better integration.
Thanks
Eric
>
> Jason
>
On Fri, 16 Apr 2021 06:12:58 -0700
Jacob Pan <[email protected]> wrote:
> Hi Jason,
>
> On Thu, 15 Apr 2021 20:07:32 -0300, Jason Gunthorpe <[email protected]> wrote:
>
> > On Thu, Apr 15, 2021 at 03:11:19PM +0200, Auger Eric wrote:
> > > Hi Jason,
> > >
> > > On 4/1/21 6:03 PM, Jason Gunthorpe wrote:
> > > > On Thu, Apr 01, 2021 at 02:08:17PM +0000, Liu, Yi L wrote:
> > > >
> > > >> DMA page faults are delivered to root-complex via page request
> > > >> message and it is per-device according to PCIe spec. Page request
> > > >> handling flow is:
> > > >>
> > > >> 1) iommu driver receives a page request from device
> > > >> 2) iommu driver parses the page request message. Get the RID,PASID,
> > > >> faulted page and requested permissions etc.
> > > >> 3) iommu driver triggers fault handler registered by device driver
> > > >> with iommu_report_device_fault()
> > > >
> > > > This seems confused.
> > > >
> > > > The PASID should define how to handle the page fault, not the driver.
> > > >
> > >
> > > In my series I don't use PASID at all. I am just enabling nested stage
> > > and the guest uses a single context. I don't allocate any user PASID at
> > > any point.
> > >
> > > When there is a fault at physical level (a stage 1 fault that concerns
> > > the guest), the latter needs to be reported and injected into the
> > > guest. The vfio pci driver registers a fault handler to the iommu layer
> > > and in that fault handler it fills a circ buffer and triggers an eventfd
> > > that is listened to by the VFIO-PCI QEMU device. The latter retrieves
> > > the fault from the mmapped circ buffer, it knows which vIOMMU it is
> > > attached to, and passes the fault to the vIOMMU.
> > > Then the vIOMMU triggers an IRQ in the guest.
> > >
> > > We are reusing the existing concepts from VFIO, region, IRQ to do that.
> > >
> > > For that use case, would you also use /dev/ioasid?
> >
> > /dev/ioasid could do all the things you described vfio-pci as doing,
> > it can even do them the same way you just described.
> >
> > Stated another way, do you plan to duplicate all of this code someday
> > for vfio-cxl? What about for vfio-platform? ARM SMMU can be hooked to
> > platform devices, right?
> >
> > I feel what you guys are struggling with is some choice in the iommu
> > kernel APIs that cause the events to be delivered to the pci_device
> > owner, not the PASID owner.
> >
> > That feels solvable.
> >
> Perhaps more of a philosophical question for you and Alex. There is no
> doubt that the direction you guided for /dev/ioasid is a much cleaner one,
> especially after VDPA emerged as another IOMMU backed framework.
I think this statement answers all your remaining questions ;)
> The question is what do we do with the nested translation features that have
> been targeting the existing VFIO-IOMMU for the last three years? That
> predates VDPA. Shall we put a stop marker *after* nested support and say no
> more extensions for VFIO-IOMMU, new features must be built on this new
> interface?
>
> If we were to close a checkout line for some unforeseen reasons, should we
> honor the customers already in line for a long time?
>
> This is not a tactic or excuse for not working on the new /dev/ioasid
> interface. In fact, I believe we can benefit from the lessons learned while
> completing the existing. This will give confidence to the new
> interface. Thoughts?
I understand a big part of Jason's argument is that we shouldn't be in
the habit of creating duplicate interfaces; we should create one
well-designed interface to share among multiple subsystems. As new users
have emerged, our solution needs to change to a common one rather than
a VFIO specific one. The IOMMU uAPI provides an abstraction, but at
the wrong level, requiring userspace interfaces for each subsystem.
Luckily the IOMMU uAPI is not really exposed as an actual uAPI, but
that changes if we proceed to enable the interfaces to tunnel it
through VFIO.
The logical answer would therefore be that we don't make that
commitment to the IOMMU uAPI if we believe now that it's fundamentally
flawed.
Ideally this new /dev/ioasid interface, and making use of it as a VFIO
IOMMU backend, should replace type1. Type1 will live on until that
interface gets to parity, at which point we may deprecate type1, but it
wouldn't make sense to continue to expand type1 in the same direction
as we intend /dev/ioasid to take over in the meantime, especially if it
means maintaining an otherwise dead uAPI. Thanks,
Alex
Hi Alex,
On Fri, 16 Apr 2021 09:45:47 -0600, Alex Williamson
<[email protected]> wrote:
> On Fri, 16 Apr 2021 06:12:58 -0700
> Jacob Pan <[email protected]> wrote:
>
> > Hi Jason,
> >
> > On Thu, 15 Apr 2021 20:07:32 -0300, Jason Gunthorpe <[email protected]>
> > wrote:
> > > On Thu, Apr 15, 2021 at 03:11:19PM +0200, Auger Eric wrote:
> > > > Hi Jason,
> > > >
> > > > On 4/1/21 6:03 PM, Jason Gunthorpe wrote:
> > > > > On Thu, Apr 01, 2021 at 02:08:17PM +0000, Liu, Yi L wrote:
> > > > >
> > > > >> DMA page faults are delivered to root-complex via page request
> > > > >> message and it is per-device according to PCIe spec. Page request
> > > > >> handling flow is:
> > > > >>
> > > > >> 1) iommu driver receives a page request from device
> > > > >> 2) iommu driver parses the page request message. Get the
> > > > >> RID,PASID, faulted page and requested permissions etc.
> > > > >> 3) iommu driver triggers fault handler registered by device
> > > > >> driver with iommu_report_device_fault()
> > > > >
> > > > > This seems confused.
> > > > >
> > > > > The PASID should define how to handle the page fault, not the
> > > > > driver.
> > > >
> > > > In my series I don't use PASID at all. I am just enabling nested
> > > > stage and the guest uses a single context. I don't allocate any
> > > > user PASID at any point.
> > > >
> > > > When there is a fault at physical level (a stage 1 fault that
> > > > concerns the guest), the latter needs to be reported and injected
> > > > into the guest. The vfio pci driver registers a fault handler to
> > > > the iommu layer and in that fault handler it fills a circ buffer
> > > > and triggers an eventfd that is listened to by the VFIO-PCI QEMU
> > > > device. The latter retrieves the fault from the mmapped circ
> > > > buffer, it knows which vIOMMU it is attached to, and passes the
> > > > fault to the vIOMMU. Then the vIOMMU triggers an IRQ in the guest.
> > > >
> > > > We are reusing the existing concepts from VFIO, region, IRQ to do
> > > > that.
> > > >
> > > > For that use case, would you also use /dev/ioasid?
> > >
> > > /dev/ioasid could do all the things you described vfio-pci as doing,
> > > it can even do them the same way you just described.
> > >
> > > Stated another way, do you plan to duplicate all of this code someday
> > > for vfio-cxl? What about for vfio-platform? ARM SMMU can be hooked to
> > > platform devices, right?
> > >
> > > I feel what you guys are struggling with is some choice in the iommu
> > > kernel APIs that cause the events to be delivered to the pci_device
> > > owner, not the PASID owner.
> > >
> > > That feels solvable.
> > >
> > Perhaps more of a philosophical question for you and Alex. There is no
> > doubt that the direction you guided for /dev/ioasid is a much cleaner
> > one, especially after VDPA emerged as another IOMMU backed framework.
>
> I think this statement answers all your remaining questions ;)
>
> > The question is what do we do with the nested translation features that
> > have been targeting the existing VFIO-IOMMU for the last three years?
> > That predates VDPA. Shall we put a stop marker *after* nested support
> > and say no more extensions for VFIO-IOMMU, new features must be built
> > on this new interface?
> >
> > If we were to close a checkout line for some unforeseen reasons, should
> > we honor the customers already in line for a long time?
> >
> > This is not a tactic or excuse for not working on the new /dev/ioasid
> > interface. In fact, I believe we can benefit from the lessons learned
> > while completing the existing. This will give confidence to the new
> > interface. Thoughts?
>
> I understand a big part of Jason's argument is that we shouldn't be in
> the habit of creating duplicate interfaces, we should create one, well
> designed interfaces to share among multiple subsystems. As new users
> have emerged, our solution needs to change to a common one rather than
> a VFIO specific one. The IOMMU uAPI provides an abstraction, but at
> the wrong level, requiring userspace interfaces for each subsystem.
>
> Luckily the IOMMU uAPI is not really exposed as an actual uAPI, but
> that changes if we proceed to enable the interfaces to tunnel it
> through VFIO.
>
> The logical answer would therefore be that we don't make that
> commitment to the IOMMU uAPI if we believe now that it's fundamentally
> flawed.
>
I agree the uAPI data tunneling is definitely flawed in terms of
scalability.
I was just thinking it is still a small part of the overall
picture, considering there are other parts such as fault reporting, user
space deployment, performance, and security. By completing the support on
the existing VFIO framework, it would at least offer a clear landscape
that the new /dev/ioasid can improve upon.
Perhaps it is similar to cgroup v1 vs v2: v2 took a long time, and v1 had
known limitations.
Anyway, I am glad we have a clear direction now.
Thanks,
Jacob
> Ideally this new /dev/ioasid interface, and making use of it as a VFIO
> IOMMU backend, should replace type1. Type1 will live on until that
> interface gets to parity, at which point we may deprecate type1, but it
> wouldn't make sense to continue to expand type1 in the same direction
> as we intend /dev/ioasid to take over in the meantime, especially if it
> means maintaining an otherwise dead uAPI. Thanks,
>
> Alex
>
Thanks,
Jacob
On Fri, Apr 16, 2021 at 10:23:32AM -0700, Jacob Pan wrote:
> Perhaps similar to cgroup v1 vs v2, it took a long time and with known
> limitations in v1.
cgroup v2 is still having transition problems; if anything it is a
cautionary tale to think really hard about uAPI, because transitioning
can be really hard.
It might be very wise to make /dev/ioasid and /dev/vfio ioctl
compatible in some way so existing software has a smoother upgrade
path.
For instance by defining a default IOASID
Jason
Hi Alex,
> From: Alex Williamson <[email protected]>
> Sent: Friday, April 16, 2021 11:46 PM
[...]
> > This is not a tactic or excuse for not working on the new /dev/ioasid
> > interface. In fact, I believe we can benefit from the lessons learned
> > while completing the existing. This will give confidence to the new
> > interface. Thoughts?
>
> I understand a big part of Jason's argument is that we shouldn't be in
> the habit of creating duplicate interfaces, we should create one, well
> designed interfaces to share among multiple subsystems. As new users
> have emerged, our solution needs to change to a common one rather than
> a VFIO specific one. The IOMMU uAPI provides an abstraction, but at
> the wrong level, requiring userspace interfaces for each subsystem.
>
> Luckily the IOMMU uAPI is not really exposed as an actual uAPI, but
> that changes if we proceed to enable the interfaces to tunnel it
> through VFIO.
>
> The logical answer would therefore be that we don't make that
> commitment to the IOMMU uAPI if we believe now that it's fundamentally
> flawed.
>
> Ideally this new /dev/ioasid interface, and making use of it as a VFIO
> IOMMU backend, should replace type1.
yeah, just to double-check: I think this also requires a new set of uAPIs
(e.g. new MAP/UNMAP), which means the current VFIO IOMMU type1 related ioctls
would be deprecated in the future, right?
> Type1 will live on until that
> interface gets to parity, at which point we may deprecate type1, but it
> wouldn't make sense to continue to expand type1 in the same direction
> as we intend /dev/ioasid to take over in the meantime, especially if it
> means maintaining an otherwise dead uAPI. Thanks,
understood.
Regards,
Yi Liu
On Wed, Apr 21, 2021 at 01:18:07PM +0000, Liu, Yi L wrote:
> > Ideally this new /dev/ioasid interface, and making use of it as a VFIO
> > IOMMU backend, should replace type1.
>
> yeah, just a double check, I think this also requires a new set of uAPIs
> (e.g. new MAP/UNMAP), which means the current VFIO IOMMU type1 related ioctls
> would be deprecated in future. right?
This is something to think about; it might make sense to run the
current ioctls in some "compat" mode under /dev/ioasid just to make
migration easier.
In this sense /dev/ioasid would be a container that holds multiple
IOASIDs and every new format ioctl specifies the IOASID to operate
on. The legacy ioctls would use some default IOASID but otherwise act
the same.
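Purely as a sketch of that compat idea, where struct ioasid_ctx, its
default_ioasid member and ioasid_map_range() are made-up names, the legacy
path could look roughly like:

  /* Sketch only: legacy type1 ioctls act on an implicit default IOASID. */
  static long ioasid_compat_ioctl(struct ioasid_ctx *ctx,
                                  unsigned int cmd, unsigned long arg)
  {
          struct vfio_iommu_type1_dma_map map;

          switch (cmd) {
          case VFIO_IOMMU_MAP_DMA:
                  if (copy_from_user(&map, (void __user *)arg, sizeof(map)))
                          return -EFAULT;
                  /* Old userspace never names an IOASID, so use the default
                   * one created when the /dev/ioasid fd was opened. */
                  return ioasid_map_range(ctx->default_ioasid, map.iova,
                                          map.vaddr, map.size, map.flags);
          default:
                  /* VFIO_IOMMU_UNMAP_DMA etc. would be forwarded the same way. */
                  return -ENOTTY;
          }
  }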
I'm assuming here there is nothing especially wrong with the /dev/vfio
interface beyond being in the wrong place in the kernel and not
supporting multiple IOASIDs?
Then there may be a fairly simple approach to just make /dev/vfio ==
/dev/ioasid, at least for type 1.
By this I mean we could have the new /dev/ioasid code take over the
/dev/vfio char dev and present both interfaces, but with the same
fops.
The VFIO code would have to remain somehow to support PPC until
someone from the ppc world migrates SPAPR_TCE to use the kernel's new
common IOMMU framework instead of the arch specialty thing it does
now. But it can at least be compile-disabled on everything except ppc.
Jason
On Wed, 21 Apr 2021 13:23:07 -0300
Jason Gunthorpe <[email protected]> wrote:
> On Wed, Apr 21, 2021 at 01:18:07PM +0000, Liu, Yi L wrote:
> > > Ideally this new /dev/ioasid interface, and making use of it as a VFIO
> > > IOMMU backend, should replace type1.
> >
> > yeah, just a double check, I think this also requires a new set of uAPIs
> > (e.g. new MAP/UNMAP), which means the current VFIO IOMMU type1 related ioctls
> > would be deprecated in future. right?
>
> This is something to think about, it might make sense to run the
> current ioctls in some "compat" mode under /dev/ioasid just to make
> migration easier
Right, deprecating type1 doesn't necessarily mean deprecating the uAPI.
We created a type1v2 with minor semantic differences in unmap behavior
within the same uAPI. Userspace is able to query and select an IOMMU
backend model and each model might have a different uAPI. The SPAPR
IOMMU backend already takes advantage of this, using some ioctls
consistent with type1, but also requiring some extra steps.
Also note that the simple MAP and UNMAP uAPI of type1 has its
limitations, which we already struggle with. See for example the
massive performance issues backing a vIOMMU with this uAPI. The
/dev/ioasid approach should alleviate some of that, using a page table
for the 1st level, but a more advanced uAPI for the 2nd level seems
necessary at some point as well.
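For reference, this is roughly the type1 map path a vIOMMU-backing userspace
has to hit once per guest mapping today (real VFIO_IOMMU_MAP_DMA uAPI, error
handling omitted), which is where the latency pain comes from:

  #include <stdint.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/vfio.h>

  /* One ioctl per mapped range; a vIOMMU ends up doing this (plus the
   * matching VFIO_IOMMU_UNMAP_DMA) for every guest mapping update. */
  static int type1_map(int container_fd, uint64_t iova, void *vaddr,
                       uint64_t size)
  {
          struct vfio_iommu_type1_dma_map map;

          memset(&map, 0, sizeof(map));
          map.argsz = sizeof(map);
          map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
          map.vaddr = (uint64_t)(uintptr_t)vaddr;
          map.iova  = iova;
          map.size  = size;

          return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
  }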
> In this sense /dev/ioasid would be a container that holds multiple
> IOASIDs and every new format ioctl specifies the IOASID to operate
> on. The legacy ioctls would use some default IOASID but otherwise act
> the same.
>
> I'm assuming here there is nothing especially wrong with the /dev/vfio
> interface beyond being in the wrong place in the kernel and not
> supporting multiple IOASIDs?
>
> Then there may be a fairly simple approch to just make /dev/vfio ==
> /dev/ioasid, at least for type 1.
>
> By this I mean we could have the new /dev/ioasid code take over the
> /dev/vfio char dev and present both interfaces, but with the same
> fops.
That's essentially replacing vfio-core, where I think we're more
talking about /dev/ioasid being an available IOMMU backend driver which
a user can select when available. The interface of making that
selection might change to accept an external /dev/ioasid file
descriptor, of course. Maybe you can elaborate on how the vfio device
and group uAPI live (or not) in this new scheme where /dev/ioasid is the
primary interface. Thanks,
Alex
On Wed, Apr 21, 2021 at 10:54:51AM -0600, Alex Williamson wrote:
> That's essentially replacing vfio-core, where I think we're more
I am only talking about /dev/vfio here which is basically the IOMMU
interface part.
I still expect that VFIO_GROUP_SET_CONTAINER will be used to connect
/dev/{ioasid,vfio} to the VFIO group and all the group and device
logic stays inside VFIO.
The appeal of unifying /dev/{ioasid,vfio} to a single fops is that it
cleans up vfio a lot - we don't have to have two different code paths
where one handles a vfio_container and the other an ioasid_container,
with all the related different iommu ops and so on.
Everything can be switched to ioasid_container all down the line. If
it wasn't for PPC this looks fairly simple.
Since getting rid of PPC looks a bit hard, we'd be stuck with
accepting a /dev/ioasid and then immediately wrapping it in a
vfio_container and shimming it through a vfio_iommu_ops. It is not
ideal at all, but in my look around I don't see a major problem if the
type1 implementation is moved to live under /dev/ioasid.
For concreteness if we look at the set container flow with ioasid I'd
say something like:
vfio_group_fops_unl_ioctl()
   VFIO_GROUP_SET_CONTAINER
     vfio_group_set_container()
       if (f.file->f_op == &vfio_fops) {
               // Use a real vfio_container and vfio_iommu_driver
               driver->ops->attach_group()
                tce_iommu_attach_group()
       }
       if (ioasid_container = ioasid_get_from_fd(container_fd)) {
               // create a dummy vfio_container and use the ioasid driver
               container = kzalloc()
               container->iommu_driver = ioasid_shim
               driver->ops->attach_group()
                ioasid_shim_attach_group(ioasid_container, ...)
                 ioasid_attach_group()
                  // What used to be vfio_iommu_attach_group()
       }
Broadly all the ops vfio need go through the ioasid_shim which relays
them to the generic ioasid API.
We end up with a ioasid.h that basically has the vfio_iommu_type1 code
lightly recast into some 'struct iommu_container' and a set of
ioasid_* function entry points that follow vfio_iommu_driver_ops_type1:
ioasid_attach_group
ioasid_detach_group
ioasid_<something about user pages>
ioasid_read/ioasid_write
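Something along these lines, where vfio_iommu_driver_ops is the existing
vfio structure and the ioasid_* / ioasid_container_alloc() helpers are the
made-up entry points above (only a couple of ops shown):

  /* Sketch: a thin vfio IOMMU "driver" that only relays to ioasid.c. */
  static void *ioasid_shim_open(unsigned long arg)
  {
          /* create and return the wrapper ioasid_container */
          return ioasid_container_alloc();
  }

  static int ioasid_shim_attach_group(void *iommu_data,
                                      struct iommu_group *group)
  {
          return ioasid_attach_group(iommu_data, group);
  }

  static const struct vfio_iommu_driver_ops ioasid_shim_ops = {
          .name           = "ioasid-shim",
          .owner          = THIS_MODULE,
          .open           = ioasid_shim_open,
          .attach_group   = ioasid_shim_attach_group,
          /* .release, .detach_group, .ioctl relayed the same way */
  };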
If we have this, and /dev/ioasid implements the legacy IOCTLs, then
/dev/vfio == /dev/ioasid and we can compile out vfio_fops and related
from vfio.c and tell ioasid.c to create /dev/vfio instead using the
ops it owns.
This is a very long winded way of saying ideally we'd do
approximately:
git mv drivers/vfio/vfio_iommu_type1.c drivers/ioasid/ioasid.c
As the first step. Essentially we declare that what is type1 is really
the user interface to the internal kernel IOMMU kAPI, which has been
steadily evolving since type1 was created 10 years ago.
> The interface of making that selection might change to accept an
> external /dev/ioasid file descriptor, of course. Maybe you can
> elaborate on how the vfio device and group uAPI live (or not) in
> this new scheme where /dev/ioasid is the primary interface. Thanks,
They stay in vfio. You'd still open a group and you'd still pass in
either /dev/vfio or /dev/ioasid to define the container
Though, completely as an unrelated aside, I admit to not entirely
understanding why the group is the central element of the uAPI.
It is weird that the vfio "drivers" all work on the struct vfio_device
(at least after my series), and it has a file_operations presence via
vfio_device_fops, but instead of struct vfio_device directly having a
'struct device' and cdev to access the FD we get it through a group FD
and a group chardev via VFIO_GROUP_GET_DEVICE_FD
If we were to revise this, and I don't see a huge reason to do so, I
would put a struct device and cdev in struct vfio_device, attach the
vfio_device directly to the ioasid and then forget about the group, at
least as uapi, completely.
Or at least I don't see where that gets into trouble, but I'm not too
familiar with the multi-vfio in a process scenario..
Jason
On Wed, 21 Apr 2021 14:52:03 -0300
Jason Gunthorpe <[email protected]> wrote:
> On Wed, Apr 21, 2021 at 10:54:51AM -0600, Alex Williamson wrote:
>
> > That's essentially replacing vfio-core, where I think we're more
>
> I am only talking about /dev/vfio here which is basically the IOMMU
> interface part.
>
> I still expect that VFIO_GROUP_SET_CONTAINER will be used to connect
> /dev/{ioasid,vfio} to the VFIO group and all the group and device
> logic stays inside VFIO.
But that group and device logic is also tied to the container, where
the IOMMU backend is the interchangeable thing that provides the IOMMU
manipulation for that container. If you're using
VFIO_GROUP_SET_CONTAINER to associate a group to a /dev/ioasid, then
you're really either taking that group outside of vfio or you're
re-implementing group management in /dev/ioasid. I'd expect the
transition point at VFIO_SET_IOMMU.
> The appeal of unifying /dev/{ioasid,vfio} to a single fops is that it
> cleans up vfio a lot - we don't have to have two different code paths
> where one handles a vfio_container and the other a ioasid_container
> and the all the related different iommu ops and so on.
Currently vfio IOMMU backends don't know about containers either.
Setting the vfio IOMMU for a container creates an object within the
IOMMU backend representing that IOMMU context. IOMMU groups are then
attached to that context, where the IOMMU backend can add to or create a
new IOMMU domain to include that group, or if no compatible IOMMU
context can be created, reject it.
> Everything can be switched to ioasid_container all down the line. If
> it wasn't for PPC this looks fairly simple.
At what point is it no longer vfio? I'd venture to say that replacing
the container rather than invoking a different IOMMU backend is that
point.
> Since getting rid of PPC looks a bit hard, we'd be stuck with
> accepting a /dev/ioasid and then immediately wrappering it in a
> vfio_container an shimming it through a vfio_iommu_ops. It is not
> ideal at all, but in my look around I don't see a major problem if
> type1 implementation is moved to live under /dev/ioasid.
But type1 is \just\ an IOMMU backend, not "/dev/vfio". Given that
nobody flinched at removing NVLink support, maybe just deprecate SPAPR
now and see if anyone objects ;)
> For concreteness if we look at the set container flow with ioasid I'd
> say something like:
>
> vfio_group_fops_unl_ioctl()
>    VFIO_GROUP_SET_CONTAINER
>      vfio_group_set_container()
>        if (f.file->f_op == &vfio_fops) {
>                // Use a real vfio_container and vfio_iommu_driver
>                driver->ops->attach_group()
>                 tce_iommu_attach_group()
>        }
>
>        if (ioasid_container = ioasid_get_from_fd(container_fd)) {
>                // create a dummy vfio_container and use the ioasid driver
>                container = kzalloc()
>                container->iommu_driver = ioasid_shim
>                driver->ops->attach_group()
>                 ioasid_shim_attach_group(ioasid_container, ...)
>                  ioasid_attach_group()
>                   // What used to be vfio_iommu_attach_group()
>        }
How do you handle multiple groups with the same container? Again, I'd
expect some augmentation of VFIO_SET_IOMMU so that /dev/vfio continues
to exist and manage group to container mapping and /dev/ioasid manages
the IOMMU context of that container.
>
> Broadly all the ops vfio need go through the ioasid_shim which relays
> them to the generic ioasid API.
/dev/vfio essentially already passes through all fops to the IOMMU
backend once the VFIO_SET_IOMMU is established.
> We end up with a ioasid.h that basically has the vfio_iommu_type1 code
> lightly recast into some 'struct iommu_container' and a set of
> ioasid_* function entry points that follow vfio_iommu_driver_ops_type1:
> ioasid_attach_group
> ioasid_detatch_group
> ioasid_<something about user pages>
> ioasid_read/ioasid_write
Again, this looks like a vfio IOMMU backend. What are we accomplishing
by replacing /dev/vfio with /dev/ioasid versus some manipulation of
VFIO_SET_IOMMU accepting a /dev/ioasid fd?
> If we have this, and /dev/ioasid implements the legacy IOCTLs, then
> /dev/vfio == /dev/ioasid and we can compile out vfio_fops and related
> from vfio.c and tell ioasid.c to create /dev/vfio instead using the
> ops it owns.
Why would we want /dev/ioasid to implement legacy ioctls instead of
simply implementing an interface to allow /dev/ioasid to be used as a
vfio IOMMU backend?
> This is a very long winded way of saying ideally we'd do
> approximately:
> git mv drivers/vfio/vfio_iommu_type1.c drivers/ioasid/ioasid.c
>
> As the first step. Essentially we declare that what is type1 is really
> the user interface to the internal kernel IOMMU kAPI, which has been
> steadily evolving since type1 was created 10 years ago.
The pseudo code above really suggests you do want to remove
/dev/vfio/vfio, but this is only one of the IOMMU backends for vfio, so
I can't quite figure out if we're talking past each other.
As I expressed in another thread, type1 has a lot of shortcomings. The
mapping interface leaves userspace trying desperately to use statically
mapped buffers because the map/unmap latency is too high. We have
horrible issues with duplicate locked page accounting across
containers. It suffers pretty hard from feature creep in various
areas. A new IOMMU backend is an opportunity to redesign some of these
things.
> > The interface of making that selection might change to accept an
> > external /dev/ioasid file descriptor, of course. Maybe you can
> > elaborate on how the vfio device and group uAPI live (or not) in
> > this new scheme where /dev/ioasid is the primary interface. Thanks,
>
> They stay in vfio. You'd still open a group and you'd still pass in
> either /dev/vfio or /dev/ioasid to define the container
>
> Though, completely as an unrelated aside, I admit to not entirely
> understanding why the group is the central element of the uAPI.
>
> It is weird that the vfio "drivers" all work on the struct vfio_device
> (at least after my series), and it has a file_operations presence via
> vfio_device_fops, but instead of struct vfio_device directly having a
> 'struct device' and cdev to access the FD we get it through a group FD
> and a group chardev via VFIO_GROUP_GET_DEVICE_FD
>
> If we were to revise this, and I don't see a huge reason to do so, I
> would put a struct device and cdev in struct vfio_device, attach the
> vfio_device directly to the ioasid and then forget about the group, at
> least as uapi, completely.
>
> Or at least I don't see where that gets into trouble, but I'm not too
> familiar with the multi-vfio in a process scenario..
The vfio_group is the unit of userspace ownership as it reflects the
IOMMU group as the unit of isolation. Ideally there's a 1:1 mapping
between device and group, but that is of course not always the case.
The IOMMU group also abstracts isolation and visibility relative to
DMA. For example, in a PCIe topology a multi-function device may not
have isolation between functions, but each requester ID is visible to
the IOMMU. This lacks isolation but not IOMMU granularity, or
visibility. A conventional PCI topology however lacks both isolation
and visibility: all devices downstream use either the PCIe-to-PCI
bridge RID or a RID derived from the secondary bus. We can also have
mixed topologies, for example PCIe-to-PCI<->PCI-to-PCIe, where the
grouping code needs to search upstream for the highest level where we
achieve both isolation and visibility.
To simplify this, we use the group as the unit of IOMMU context, again
favoring singleton group behavior.
An individual vfio_device doesn't know about these isolation
dependencies, thus while a vfio bus/device driver like vfio-pci can
expose a device, it's vfio-core that manages whether the isolated set
of devices which includes that device, ie. the group, meets the
requirements for userspace access. Thanks,
Alex
On Wed, Apr 21, 2021 at 01:33:12PM -0600, Alex Williamson wrote:
> > I still expect that VFIO_GROUP_SET_CONTAINER will be used to connect
> > /dev/{ioasid,vfio} to the VFIO group and all the group and device
> > logic stays inside VFIO.
>
> But that group and device logic is also tied to the container, where
> the IOMMU backend is the interchangeable thing that provides the IOMMU
> manipulation for that container.
I think that is an area where the discussion would need to be focused.
I don't feel very prepared to have it in details, as I haven't dug
into all the group and iommu micro-operation very much.
But, it does seem like the security concept that VFIO is creating with
the group also has to be present in the lower iommu layer too.
With different subsystems joining devices to the same ioasid's we
still have to enforce the security property the vfio group is creating.
> If you're using VFIO_GROUP_SET_CONTAINER to associate a group to a
> /dev/ioasid, then you're really either taking that group outside of
> vfio or you're re-implementing group management in /dev/ioasid.
This sounds right.
> > Everything can be switched to ioasid_container all down the line. If
> > it wasn't for PPC this looks fairly simple.
>
> At what point is it no longer vfio? I'd venture to say that replacing
> the container rather than invoking a different IOMMU backend is that
> point.
sorry, which is no longer vfio?
> > Since getting rid of PPC looks a bit hard, we'd be stuck with
> > accepting a /dev/ioasid and then immediately wrappering it in a
> > vfio_container an shimming it through a vfio_iommu_ops. It is not
> > ideal at all, but in my look around I don't see a major problem if
> > type1 implementation is moved to live under /dev/ioasid.
>
> But type1 is \just\ an IOMMU backend, not "/dev/vfio". Given that
> nobody flinched at removing NVLink support, maybe just deprecate SPAPR
> now and see if anyone objects ;)
Would simplify this project, but I wonder :)
In any event, it does look like today we'd expect the SPAPR stuff
would be done through the normal iommu APIs, perhaps enhanced a bit,
which makes me suspect an enhanced type1 can implement SPAPR.
I say this because the SPAPR looks quite a lot like PASID when it has
APIs for allocating multiple tables and other things. I would be
interested to hear someone from IBM talk about what it is doing and
how it doesn't fit into today's IOMMU API.
It is very old and the iommu world has advanced tremendously lately,
maybe I'm too optimistic?
> > We end up with a ioasid.h that basically has the vfio_iommu_type1 code
> > lightly recast into some 'struct iommu_container' and a set of
> > ioasid_* function entry points that follow vfio_iommu_driver_ops_type1:
> > ioasid_attach_group
> > ioasid_detatch_group
> > ioasid_<something about user pages>
> > ioasid_read/ioasid_write
>
> Again, this looks like a vfio IOMMU backend. What are we accomplishing
> by replacing /dev/vfio with /dev/ioasid versus some manipulation of
> VFIO_SET_IOMMU accepting a /dev/ioasid fd?
The point of all of this is to make the user api for the IOMMU
cross-subsystem. It is not a vfio IOMMU backend, it is moving the
IOMMU abstraction from VFIO into the iommu framework and giving the
iommu framework a re-usable user API.
My ideal outcome would be for VFIO to use only the new iommu/ioasid
API and have no iommu pluggability at all. The iommu subsystem
provides everything needed to VFIO, and provides it equally to VDPA
and everything else.
drivers/vfio/ becomes primarily about 'struct vfio_device' and
everything related to its IOCTL interface.
drivers/iommu and ioasid.c become all about a pluggable IOMMU
interface, including a uAPI for it.
IMHO it makes a high level sense, though it may be a pipe dream.
> > If we have this, and /dev/ioasid implements the legacy IOCTLs, then
> > /dev/vfio == /dev/ioasid and we can compile out vfio_fops and related
> > from vfio.c and tell ioasid.c to create /dev/vfio instead using the
> > ops it owns.
>
> Why would we want /dev/ioasid to implement legacy ioctls instead of
> simply implementing an interface to allow /dev/ioasid to be used as a
> vfio IOMMU backend?
Only to make our own migration easier. I'd imagine everyone would want
to sit down and design this new clear ioasid API that can co-exist on
/dev/ioasid with the legacy one.
> The pseudo code above really suggests you do want to remove
> /dev/vfio/vfio, but this is only one of the IOMMU backends for vfio, so
> I can't quite figure out if we're talking past each other.
I'm not quite sure what you mean by "one of the IOMMU backends?" You
mean type1, right?
> As I expressed in another thread, type1 has a lot of shortcomings. The
> mapping interface leaves userspace trying desperately to use statically
> mapped buffers because the map/unmap latency is too high. We have
> horrible issues with duplicate locked page accounting across
> containers. It suffers pretty hard from feature creep in various
> areas. A new IOMMU backend is an opportunity to redesign some of these
> things.
Sure, but also those kinds of transformational things go a lot better
if you can smoothly go from the old to the new and have technical
co-existence inside the kernel. Having a shim that maps the old APIs
to new APIs internally to Linux helps keep the implementation from
becoming too bogged down with compatibility.
> The IOMMU group also abstracts isolation and visibility relative to
> DMA. For example, in a PCIe topology a multi-function device may not
> have isolation between functions, but each requester ID is visible to
> the IOMMU.
Okay, I'm glad I have this all right in my head, as I was pretty sure
this was what the group was about.
My next question is why do we have three things as a FD: group, device
and container (aka IOMMU interface)?
Do we have container because the /dev/vfio/vfio can hold only a single
page table so we need to swap containers sometimes?
If we start from a clean sheet and make a sketch..
/dev/ioasid is the IOMMU control interface. It can create multiple
IOASIDs that have page tables and it can manipulate those page tables.
Each IOASID is identified by some number.
struct vfio_device/vdpa_device/etc are consumers of /dev/ioasid
When a device attaches to an ioasid userspace gives VFIO/VDPA the
ioasid FD and the ioasid # in the FD.
The security rule for isolation is that once a device is attached to a
/dev/ioasid fd then all other devices in that security group must be
attached to the same ioasid FD or left unused.
Thus /dev/ioasid also becomes the unit of security and the IOMMU
subsystem level becomes aware of and enforces the group security
rules. Userspace does not need to "see" the group
In sketch it would be like
 ioasid_fd = open("/dev/ioasid");
 vfio_device_fd = open("/dev/vfio/device0")
 vdpa_device_fd = open("/dev/vdpa/device0")
 ioctl(vfio_device_fd, JOIN_IOASID_FD, ioasid_fd)
 ioctl(vdpa_device_fd, JOIN_IOASID_FD, ioasid_fd)

 gpa_ioasid_id = ioctl(ioasid_fd, CREATE_IOASID, ..)
 ioctl(ioasid_fd, SET_IOASID_PAGE_TABLES, ..)

 ioctl(vfio_device_fd, ATTACH_IOASID, gpa_ioasid_id)
 ioctl(vdpa_device_fd, ATTACH_IOASID, gpa_ioasid_id)

 .. both VDPA and VFIO see the guest physical map and the kernel has
 enough info that both could use the same IOMMU page table
 structure ..

 // Guest viommu turns off bypass mode for the vfio device
 ioctl(vfio_device_fd, DETACH_IOASID)

 // Guest viommu creates a new page table
 rid_ioasid_id = ioctl(ioasid_fd, CREATE_IOASID, ..)
 ioctl(ioasid_fd, SET_IOASID_PAGE_TABLES, ..)

 // Guest viommu links the new page table to the RID
 ioctl(vfio_device_fd, ATTACH_IOASID, rid_ioasid_id)
The group security concept becomes implicit and hidden from the
uAPI. JOIN_IOASID_FD implicitly finds the device's group inside the
kernel and requires that all members of the group be joined only to
this ioasid_fd.
Essentially we discover the group from the device instead of the
device from the group.
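In kernel terms that check could be a dumb walk of the joining device's
iommu group, something like the sketch below (iommu_group_get/put and
iommu_group_for_each_dev are the real helpers, everything else is invented):

  struct join_check {
          struct ioasid_fd_ctx *ictx;     /* the /dev/ioasid instance */
  };

  static int check_one_device(struct device *dev, void *data)
  {
          struct join_check *jc = data;

          if (!device_is_user_owned(dev))                 /* invented helper */
                  return 0;                               /* unused device: fine */
          if (device_joined_ioasid_fd(dev) == jc->ictx)   /* invented helper */
                  return 0;                               /* same fd: fine */
          return -EBUSY;                                  /* owned elsewhere */
  }

  static int ioasid_fd_join_device(struct ioasid_fd_ctx *ictx,
                                   struct device *dev)
  {
          struct iommu_group *group = iommu_group_get(dev);
          struct join_check jc = { .ictx = ictx };
          int ret;

          if (!group)
                  return -ENODEV;
          ret = iommu_group_for_each_dev(group, &jc, check_one_device);
          iommu_group_put(group);
          return ret;
  }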
Where does it fall down compared to the three FD version we have
today?
Jason
> From: Jason Gunthorpe <[email protected]>
> Sent: Thursday, April 22, 2021 7:03 AM
>
> > The pseudo code above really suggests you do want to remove
> > /dev/vfio/vfio, but this is only one of the IOMMU backends for vfio, so
> > I can't quite figure out if we're talking past each other.
>
> I'm not quite sure what you mean by "one of the IOMMU backends?" You
> mean type1, right?
I think Alex meant that type1 is one of the IOMMU backends in VFIO (type1,
type1v2, tce, tce_v2, noiommu, etc.) which are all configured through
/dev/vfio/vfio. If we are just moving type1 to /dev/ioasid, the justification
is not sufficient by replacing /dev/vfio/vfio with /dev/ioasid, at least in
this transition phase (before all iommu bits are consolidated in /dev/ioasid
in your ideal outcome).
>
> > As I expressed in another thread, type1 has a lot of shortcomings. The
> > mapping interface leaves userspace trying desperately to use statically
> > mapped buffers because the map/unmap latency is too high. We have
> > horrible issues with duplicate locked page accounting across
> > containers. It suffers pretty hard from feature creep in various
> > areas. A new IOMMU backend is an opportunity to redesign some of these
> > things.
>
> Sure, but also those kinds of transformational things go alot better
> if you can smoothly go from the old to the new and have technical
> co-existance in side the kernel. Having a shim that maps the old APIs
> to new APIs internally to Linux helps keep the implementation from
> becoming too bogged down with compatibility.
The shim layer could be considered as a new iommu backend in VFIO,
which connects VFIO iommu ops to the internal helpers in drivers/ioasid.
In this case then we don't need to replicate the VFIO uAPI through
/dev/ioasid. Instead the new interface just supports new uAPI. An old
VFIO userspace still opens /dev/vfio/vfio to conduct iommu operations
which implicitly goes to drivers/ioasid. A new VFIO userspace uses
/dev/vfio/vfio to join ioasid_fd and then uses new uAPIs through /dev/
ioasid to manage iommu pgtables, as you described below.
>
> > The IOMMU group also abstracts isolation and visibility relative to
> > DMA. For example, in a PCIe topology a multi-function device may not
> > have isolation between functions, but each requester ID is visible to
> > the IOMMU.
>
> Okay, I'm glad I have this all right in my head, as I was pretty sure
> this was what the group was about.
>
> My next question is why do we have three things as a FD: group, device
> and container (aka IOMMU interface)?
>
> Do we have container because the /dev/vfio/vfio can hold only a single
> page table so we need to swap containers sometimes?
Yes, one container can hold only a single page table. When vIOMMU is
exposed, VFIO requires each device/group to be put in a different container
to support a per-device address space (before nested translation is supported),
which is switched between GPA and gIOVA when bypass mode is turned
on/off for a given device.
Another tricky thing is that a container may be linked to multiple iommu
domains in VFIO, as devices in the container may locate behind different
IOMMUs with inconsistent capability (commit 1ef3e2bc). In this case
more accurately one container can hold a single address space, which could
be replayed into multiple page tables (with exact same mappings). I'm not
sure whether this is something that could be simplified (or not supported)
in the new interface. In the end each pgtable operation is per iommu domain
in the iommu layer. I wonder where we want to maintain the relationship
between the ioasid_fd and associated iommu domains.
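The "replay" mentioned above is conceptually just repeating each mapping
into every attached domain, something like this sketch (the list/structures
are invented, iommu_map() is the real kernel API):

  struct replay_domain {
          struct iommu_domain *domain;
          struct list_head next;
  };

  /* Apply one userspace mapping to every iommu domain attached to the
   * container; pinning, locking and error unwind all omitted. */
  static int replay_mapping(struct list_head *domains, unsigned long iova,
                            phys_addr_t paddr, size_t size, int prot)
  {
          struct replay_domain *d;
          int ret;

          list_for_each_entry(d, domains, next) {
                  ret = iommu_map(d->domain, iova, paddr, size, prot);
                  if (ret)
                          return ret;
          }
          return 0;
  }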
>
> If we start from a clean sheet and make a sketch..
>
> /dev/ioasid is the IOMMU control interface. It can create multiple
> IOASIDs that have page tables and it can manipulate those page tables.
> Each IOASID is identified by some number.
>
> struct vfio_device/vdpa_device/etc are consumers of /dev/ioasid
>
> When a device attaches to an ioasid userspace gives VFIO/VDPA the
> ioasid FD and the ioasid # in the FD.
>
> The security rule for isolation is that once a device is attached to a
> /dev/ioasid fd then all other devices in that security group must be
> attached to the same ioasid FD or left unused.
>
> Thus /dev/ioasid also becomes the unit of security and the IOMMU
> subsystem level becomes aware of and enforces the group security
> rules. Userspace does not need to "see" the group
>
> In sketch it would be like
> ioasid_fd = open("/dev/ioasid");
> vfio_device_fd = open("/dev/vfio/device0")
> vdpa_device_fd = open("/dev/vdpa/device0")
> ioctl(vifo_device_fd, JOIN_IOASID_FD, ioasifd)
> ioctl(vdpa_device_fd, JOIN_IOASID_FD, ioasifd)
>
> gpa_ioasid_id = ioctl(ioasid_fd, CREATE_IOASID, ..)
> ioctl(ioasid_fd, SET_IOASID_PAGE_TABLES, ..)
>
> ioctl(vfio_device, ATTACH_IOASID, gpa_ioasid_id)
> ioctl(vpda_device, ATTACH_IOASID, gpa_ioasid_id)
>
> .. both VDPA and VFIO see the guest physical map and the kernel has
> enough info that both could use the same IOMMU page table
> structure ..
>
> // Guest viommu turns off bypass mode for the vfio device
> ioctl(vfio_device, DETATCH_IOASID)
>
> // Guest viommu creates a new page table
> rid_ioasid_id = ioctl(ioasid_fd, CREATE_IOASID, ..)
> ioctl(ioasid_fd, SET_IOASID_PAGE_TABLES, ..)
>
> // Guest viommu links the new page table to the RID
> ioctl(vfio_device, ATTACH_IOASID, rid_ioasid_id)
Just to confirm: the above flow is for the current map/unmap flavor, i.e. what
VFIO/vDPA do today. Later when nested translation is supported,
there is no need to detach gpa_ioasid_fd. Instead, a new cmd will
be introduced to nest rid_ioasid_fd on top of gpa_ioasid_fd:
ioctl(ioasid_fd, NEST_IOASIDS, rid_ioasid_id, gpa_ioasid_id);
ioctl(ioasid_fd, BIND_PGTABLE, rid_ioasid_id, ...);
and vSVA will follow the same flow:
ioctl(ioasid_fd, NEST_IOASIDS, sva_ioasid_id, gpa_ioasid_id);
ioctl(ioasid_fd, BIND_PGTABLE, sva_ioasid_id, ...);
Does it match your mind when expanding /dev/ioasid to support
vSVA and other new usages?
>
> The group security concept becomes implicit and hidden from the
> uAPI. JOIN_IOASID_FD implicitly finds the device's group inside the
> kernel and requires that all members of the group be joined only to
> this ioasid_fd.
>
> Essentially we discover the group from the device instead of the
> device from the group.
>
> Where does it fall down compared to the three FD version we have
> today?
>
I also feel hiding the group from the uAPI is a good thing, and am interested in
the rationale behind explicitly managing the group in vfio (which is
essentially the same boundary as provided by iommu group), e.g. for
better user experience when group security is broken?
Thanks
Kevin
On Thu, Apr 22, 2021 at 08:34:32AM +0000, Tian, Kevin wrote:
> The shim layer could be considered as a new iommu backend in VFIO,
> which connects VFIO iommu ops to the internal helpers in
> drivers/ioasid.
It may be the best we can do because of SPAPR, but the ideal outcome
should be to remove the entire pluggable IOMMU stuff from vfio
entirely and have it only use /dev/ioasid
We should never add another pluggable IOMMU type to vfio - everything
should be done through drivers/iommu now that it is much more capable.
> Another tricky thing is that a container may be linked to multiple iommu
> domains in VFIO, as devices in the container may locate behind different
> IOMMUs with inconsistent capability (commit 1ef3e2bc).
Frankly this sounds over complicated. I would think /dev/ioasid should
select the IOMMU when the first device is joined, and all future joins
must be compatible with the original IOMMU - ie there is only one set
of IOMMU capabilities in a /dev/ioasid.
This means qemu might have multiple /dev/ioasid's if the system has
multiple incompatible IOMMUs (is this actually a thing?) The platform
should design its IOMMU domains to minimize the number of
/dev/ioasid's required.
Is there a reason we need to share IOASIDs between completely
divergent IOMMU implementations? I don't expect the HW should be able
to physically share page tables??
That decision point alone might be the thing that just says we can't
ever have /dev/vfio/vfio == /dev/ioasid
> Just to confirm. Above flow is for current map/unmap flavor as what
> VFIO/vDPA do today. Later when nested translation is supported,
> there is no need to detach gpa_ioasid_fd. Instead, a new cmd will
> be introduced to nest rid_ioasid_fd on top of gpa_ioasid_fd:
Sure.. The tricky bit will be to define both of the common nested
operating modes.
nested_ioasid = ioctl(ioasid_fd, CREATE_NESTED_IOASID, gpa_ioasid_id);
ioctl(ioasid_fd, SET_NESTED_IOASID_PAGE_TABLES, nested_ioasid, ..)
// IOMMU will match on the device RID, no PASID:
ioctl(vfio_device, ATTACH_IOASID, nested_ioasid);
// IOMMU will match on the device RID and PASID:
ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
Notice that ATTACH (or bind, whatever) is always done on the
vfio_device FD. ATTACH tells the IOMMU HW to link the PCI BDF&PASID to
a specific page table defined by an IOASID.
I expect we have many flavours of IOASID tables, eg we have normal,
and 'nested with table controlled by hypervisor'. ARM has 'nested with
table controlled by guest' right? So like this?
nested_ioasid = ioctl(ioasid_fd, CREATE_DELEGATED_IOASID,
gpa_ioasid_id, <some kind of viommu_id>)
// PASID now goes to <viommu_id>
ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
Where <viommu_id> is some internal to the guest handle of the viommu
page table scoped within gpa_ioasid_id? Like maybe it is GPA of the
base of the page table?
The guest can't select its own PASIDs without telling the hypervisor,
right?
> I also feel hiding group from uAPI is a good thing and is interested in
> the rationale behind for explicitly managing group in vfio (which is
> essentially the same boundary as provided by iommu group), e.g. for
> better user experience when group security is broken?
Indeed, I can see how things might have just evolved into this, but if
it has a purpose, that purpose, and whether we still need it or not,
seems pretty hidden.
Jason
On Wed, 21 Apr 2021 13:33:12 -0600, Alex Williamson wrote:
> On Wed, 21 Apr 2021 14:52:03 -0300
> Jason Gunthorpe <[email protected]> wrote:
>
> > On Wed, Apr 21, 2021 at 10:54:51AM -0600, Alex Williamson wrote:
> >
> > > That's essentially replacing vfio-core, where I think we're more
> >
> > I am only talking about /dev/vfio here which is basically the IOMMU
> > interface part.
> >
> > I still expect that VFIO_GROUP_SET_CONTAINER will be used to connect
> > /dev/{ioasid,vfio} to the VFIO group and all the group and device
> > logic stays inside VFIO.
>
> But that group and device logic is also tied to the container, where
> the IOMMU backend is the interchangeable thing that provides the IOMMU
> manipulation for that container. If you're using
> VFIO_GROUP_SET_CONTAINER to associate a group to a /dev/ioasid, then
> you're really either taking that group outside of vfio or you're
> re-implementing group management in /dev/ioasid. I'd expect the
> transition point at VFIO_SET_IOMMU.
per my understanding, transitioning at the VFIO_SET_IOMMU point makes more
sense as VFIO can still have the group and device logic, which is the key
concept of group granularity isolation for userspace direct access.
--
Regards,
Yi Liu
On Wed, 21 Apr 2021 20:03:01 -0300
Jason Gunthorpe <[email protected]> wrote:
> On Wed, Apr 21, 2021 at 01:33:12PM -0600, Alex Williamson wrote:
>
> > > I still expect that VFIO_GROUP_SET_CONTAINER will be used to connect
> > > /dev/{ioasid,vfio} to the VFIO group and all the group and device
> > > logic stays inside VFIO.
> >
> > But that group and device logic is also tied to the container, where
> > the IOMMU backend is the interchangeable thing that provides the IOMMU
> > manipulation for that container.
>
> I think that is an area where the discussion would need to be focused.
>
> I don't feel very prepared to have it in details, as I haven't dug
> into all the group and iommu micro-operation very much.
>
> But, it does seem like the security concept that VFIO is creating with
> the group also has to be present in the lower iommu layer too.
>
> With different subsystems joining devices to the same ioasid's we
> still have to enforce the security propery the vfio group is creating.
>
> > If you're using VFIO_GROUP_SET_CONTAINER to associate a group to a
> > /dev/ioasid, then you're really either taking that group outside of
> > vfio or you're re-implementing group management in /dev/ioasid.
>
> This sounds right.
>
> > > Everything can be switched to ioasid_container all down the line. If
> > > it wasn't for PPC this looks fairly simple.
> >
> > At what point is it no longer vfio? I'd venture to say that replacing
> > the container rather than invoking a different IOMMU backend is that
> > point.
>
> sorry, which is no longer vfio?
I'm suggesting that if we're replacing the container/group model with
an ioasid then we're effectively creating a new thing that really only
retains the vfio device uapi.
> > > Since getting rid of PPC looks a bit hard, we'd be stuck with
> > > accepting a /dev/ioasid and then immediately wrappering it in a
> > > vfio_container an shimming it through a vfio_iommu_ops. It is not
> > > ideal at all, but in my look around I don't see a major problem if
> > > type1 implementation is moved to live under /dev/ioasid.
> >
> > But type1 is \just\ an IOMMU backend, not "/dev/vfio". Given that
> > nobody flinched at removing NVLink support, maybe just deprecate SPAPR
> > now and see if anyone objects ;)
>
> Would simplify this project, but I wonder :)
>
> In any event, it does look like today we'd expect the SPAPR stuff
> would be done through the normal iommu APIs, perhaps enhanced a bit,
> which makes me suspect an enhanced type1 can implement SPAPR.
David Gibson has argued for some time that SPAPR could be handled via a
converged type1 model. We had mapped that out at one point,
essentially a "type2", but neither of us had any bandwidth to pursue it.
> I say this because the SPAPR looks quite a lot like PASID when it has
> APIs for allocating multiple tables and other things. I would be
> interested to hear someone from IBM talk about what it is doing and
> how it doesn't fit into today's IOMMU API.
[Cc David, Alexey]
> It is very old and the iommu world has advanced tremendously lately,
> maybe I'm too optimisitic?
>
> > > We end up with a ioasid.h that basically has the vfio_iommu_type1 code
> > > lightly recast into some 'struct iommu_container' and a set of
> > > ioasid_* function entry points that follow vfio_iommu_driver_ops_type1:
> > > ioasid_attach_group
> > > ioasid_detatch_group
> > > ioasid_<something about user pages>
> > > ioasid_read/ioasid_write
> >
> > Again, this looks like a vfio IOMMU backend. What are we accomplishing
> > by replacing /dev/vfio with /dev/ioasid versus some manipulation of
> > VFIO_SET_IOMMU accepting a /dev/ioasid fd?
>
> The point of all of this is to make the user api for the IOMMU
> cross-subsystem. It is not a vfio IOMMU backend, it is moving the
> IOMMU abstraction from VFIO into the iommu framework and giving the
> iommu framework a re-usable user API.
Right, but I don't see how that implies it cannot work within the vfio
IOMMU model. Currently when an IOMMU is set, the /dev/vfio/vfio
container becomes a conduit for file ops from the container to be
forwarded to the IOMMU. But that's in part because the user doesn't
have another object to interact with the IOMMU. It's entirely possible
that with an ioasid shim, the user would continue to interact directly
with the /dev/ioasid fd for IOMMU manipulation and only use
VFIO_SET_IOMMU to associate a vfio container to that ioasid.
> My ideal outcome would be for VFIO to use only the new iommu/ioasid
> API and have no iommu pluggability at all. The iommu subsystem
> provides everything needed to VFIO, and provides it equally to VDPA
> and everything else.
As above, we don't necessarily need to have the vfio container be the
access mechanism for the IOMMU; it can become just a means to
associate the container with an IOMMU. This has quite a few
transitional benefits.
> drivers/vfio/ becomes primarily about 'struct vfio_device' and
> everything related to its IOCTL interface.
>
> drivers/iommu and ioasid.c become all about a pluggable IOMMU
> interface, including a uAPI for it.
>
> IMHO it makes a high level sense, though it may be a pipe dream.
This is where we've dissolved all but the vfio device uapi, which
suggests the group and container model were never necessary and I'm not
sure exactly what that uapi looks like. We currently make use of an
IOMMU api that is group aware, but that awareness extends out to the
vfio uapi.
> > > If we have this, and /dev/ioasid implements the legacy IOCTLs, then
> > > /dev/vfio == /dev/ioasid and we can compile out vfio_fops and related
> > > from vfio.c and tell ioasid.c to create /dev/vfio instead using the
> > > ops it owns.
> >
> > Why would we want /dev/ioasid to implement legacy ioctls instead of
> > simply implementing an interface to allow /dev/ioasid to be used as a
> > vfio IOMMU backend?
>
> Only to make our own migration easier. I'd imagine everyone would want
> to sit down and design this new clear ioasid API that can co-exist on
> /dev/ioasid with the legacy once.
vfio really just wants to be able to attach groups to an address space
to consider them isolated, everything else about the IOMMU API could
happen via a new ioasid file descriptor representing that context, ie.
vfio handles the group ownership and device access, ioasid handles the
actual mappings.
> > The pseudo code above really suggests you do want to remove
> > /dev/vfio/vfio, but this is only one of the IOMMU backends for vfio, so
> > I can't quite figure out if we're talking past each other.
>
> I'm not quite sure what you mean by "one of the IOMMU backends?" You
> mean type1, right?
>
> > As I expressed in another thread, type1 has a lot of shortcomings. The
> > mapping interface leaves userspace trying desperately to use statically
> > mapped buffers because the map/unmap latency is too high. We have
> > horrible issues with duplicate locked page accounting across
> > containers. It suffers pretty hard from feature creep in various
> > areas. A new IOMMU backend is an opportunity to redesign some of these
> > things.
>
> Sure, but also those kinds of transformational things go alot better
> if you can smoothly go from the old to the new and have technical
> co-existance in side the kernel. Having a shim that maps the old APIs
> to new APIs internally to Linux helps keep the implementation from
> becoming too bogged down with compatibility.
I'm afraid /dev/ioasid providing type1 compatibility would be just that.
> > The IOMMU group also abstracts isolation and visibility relative to
> > DMA. For example, in a PCIe topology a multi-function device may not
> > have isolation between functions, but each requester ID is visible to
> > the IOMMU.
>
> Okay, I'm glad I have this all right in my head, as I was pretty sure
> this was what the group was about.
>
> My next question is why do we have three things as a FD: group, device
> and container (aka IOMMU interface)?
>
> Do we have container because the /dev/vfio/vfio can hold only a single
> page table so we need to swap containers sometimes?
The container represents an IOMMU address space, which can be shared by
multiple groups, where each group may contain one or more devices.
Swapping a container would require releasing all the devices (the user
cannot have access to a non-isolated device), then a group could be
moved from one container to another.
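For concreteness, the flow that model implies today, as a minimal sketch
using the existing type1 uAPI (real ioctls; the group number, BDF and
mapping parameters are placeholders, error handling omitted):

#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int legacy_vfio_map(void *buf, size_t size)
{
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/26", O_RDWR);            /* the device's IOMMU group */

	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);  /* group joins the address space */
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);  /* container gets an IOMMU backend */

	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");

	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (uint64_t)(uintptr_t)buf,
		.iova  = 0,
		.size  = size,
	};
	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);          /* mappings go through the container */
	return device;
}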
> If we start from a clean sheet and make a sketch..
>
> /dev/ioasid is the IOMMU control interface. It can create multiple
> IOASIDs that have page tables and it can manipulate those page tables.
> Each IOASID is identified by some number.
>
> struct vfio_device/vdpa_device/etc are consumers of /dev/ioasid
>
> When a device attaches to an ioasid userspace gives VFIO/VDPA the
> ioasid FD and the ioasid # in the FD.
>
> The security rule for isolation is that once a device is attached to a
> /dev/ioasid fd then all other devices in that security group must be
> attached to the same ioasid FD or left unused.
Sounds like a group... Note also that if those other devices are not
isolated from the user's device, the user could manipulate "unused"
devices via DMA. So even unused devices should be within the same
IOMMU context... thus attaching groups to IOMMU domains.
> Thus /dev/ioasid also becomes the unit of security and the IOMMU
> subsystem level becomes aware of and enforces the group security
> rules. Userspace does not need to "see" the group
What tools does userspace have to understand isolation of individual
devices without groups?
> In sketch it would be like
> ioasid_fd = open("/dev/ioasid");
> vfio_device_fd = open("/dev/vfio/device0")
> vdpa_device_fd = open("/dev/vdpa/device0")
> ioctl(vifo_device_fd, JOIN_IOASID_FD, ioasifd)
> ioctl(vdpa_device_fd, JOIN_IOASID_FD, ioasifd)
>
> gpa_ioasid_id = ioctl(ioasid_fd, CREATE_IOASID, ..)
> ioctl(ioasid_fd, SET_IOASID_PAGE_TABLES, ..)
>
> ioctl(vfio_device, ATTACH_IOASID, gpa_ioasid_id)
> ioctl(vpda_device, ATTACH_IOASID, gpa_ioasid_id)
>
> .. both VDPA and VFIO see the guest physical map and the kernel has
> enough info that both could use the same IOMMU page table
> structure ..
>
> // Guest viommu turns off bypass mode for the vfio device
> ioctl(vfio_device, DETATCH_IOASID)
>
> // Guest viommu creates a new page table
> rid_ioasid_id = ioctl(ioasid_fd, CREATE_IOASID, ..)
> ioctl(ioasid_fd, SET_IOASID_PAGE_TABLES, ..)
>
> // Guest viommu links the new page table to the RID
> ioctl(vfio_device, ATTACH_IOASID, rid_ioasid_id)
>
> The group security concept becomes implicit and hidden from the
> uAPI. JOIN_IOASID_FD implicitly finds the device's group inside the
> kernel and requires that all members of the group be joined only to
> this ioasid_fd.
>
> Essentially we discover the group from the device instead of the
> device from the group.
>
> Where does it fall down compared to the three FD version we have
> today?
The group concept is explicit today because otherwise how does userspace
learn about implicit dependencies between devices? For example, if the user
has a conventional PCI bus with a couple devices on it, how do they
understand that those devices cannot be assigned to separate userspace
drivers? The group fd fills that gap. Thanks,
Alex
On Thu, Apr 22, 2021 at 11:13:37AM -0600, Alex Williamson wrote:
> I'm suggesting that if we're replacing the container/group model with
> an ioasid then we're effectively creating a new thing that really only
> retains the vfio device uapi.
Yes, I think that is a fair assessment, but not necessarily bad.
The VFIO device uAPI is really the thing that is unique to VFIO and
cannot be re-used anyplace else; in my assessment this is what vfio
*is*, and the series I've been working on makes it more obvious how
broad that statement really is.
> > In any event, it does look like today we'd expect the SPAPR stuff
> > would be done through the normal iommu APIs, perhaps enhanced a bit,
> > which makes me suspect an enhanced type1 can implement SPAPR.
>
> David Gibson has argued for some time that SPAPR could be handled via a
> converged type1 model. We has mapped that out at one point,
> essentially a "type2", but neither of us had any bandwidth to pursue it.
Cool! Well, let's put a pin in it, I don't think revising SPAPR should
be a pre-condition for anything here - but we can all agree that an
ideal world would be able to access SPAPR functionality from
devices/iommu and /dev/ioasid.
And it would be nice to map this out enough to provide enough
preparation in the new /dev/ioasid uAPI. For instance, the only
SPAPR-specific stuff I saw in DPDK was to preset the IOVA range that the
IOASID would support. This would be trivial to add and may have
benefits to other IOMMUs by reducing the number of translation levels
or something similar.
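Purely as an illustration (the struct and the extra argument to
CREATE_IOASID are made up), presetting a range could be as small as:

struct ioasid_alloc_args alloc = {           /* illustrative only */
	.iova_start = 0,
	.iova_end   = 1ULL << 40,            /* hint: only 40 bits of IOVA needed */
};
int gpa_ioasid_id = ioctl(ioasid_fd, CREATE_IOASID, &alloc);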
> Right, but I don't see that implies it cannot work within the vfio
> IOMMU model. Currently when an IOMMU is set, the /dev/vfio/vfio
> container becomes a conduit for file ops from the container to be
> forwarded to the IOMMU. But that's in part because the user doesn't
> have another object to interact with the IOMMU. It's entirely possible
> that with an ioasid shim, the user would continue to interact directly
> with the /dev/ioasid fd for IOMMU manipulation and only use
> VFIO_SET_IOMMU to associate a vfio container to that ioasid.
I am looking at this in two directions, the first is if we have
/dev/ioasid how do we connect it to VFIO? And here I argue we need new
device IOCTLs and ideally a VFIO world that does not have a vestigial
container FD at all.
This is because /dev/ioasid will have to be multi-IOASID and it just
does not fit well into the VFIO IOMMU pluggable model at all - or at
least trying to make it fit will defeat the point of having it in the
first place.
This does not seem to be a big deal - the required device IOCTLs
should be small and having two code paths isn't going to be an
exploding complexity.
The second direction is how we keep the entire /dev/vfio/vfio uAPI
without duplicating a lot of code. This is where building an ioasid
backend or making ioasid == vfio are areas to look at.
> vfio really just wants to be able to attach groups to an address space
> to consider them isolated, everything else about the IOMMU API could
> happen via a new ioasid file descriptor representing that context, ie.
> vfio handles the group ownership and device access, ioasid handles the
> actual mappings.
Right, exactly.
> > Do we have container because the /dev/vfio/vfio can hold only a single
> > page table so we need to swap containers sometimes?
>
> The container represents an IOMMU address space, which can be shared by
> multiple groups, where each group may contain one or more devices.
> Swapping a container would require releasing all the devices (the user
> cannot have access to a non-isolated device), then a group could be
> moved from one container to another.
So, basically, the answer is yes.
Having the container FD hold a single IOASID forced the group FD to
exist because we can't maintain the security object of a group in the
container FD if the workflow is to swap the container FD.
What I suggest here is to merge the group security and the multiple
"IOMMU address space" concepts into one FD. The /dev/ioasid would have
multiple page table objects within it, called IOASIDs, and each IOASID
effectively represents what /dev/vfio/vfio does today.
We can assign any device joined to a /dev/ioasid FD to any IOASID inside
that FD, dynamically.
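Roughly, reusing the hypothetical ioctls from the earlier sketch (none of
this exists today):

int ioasid_fd = open("/dev/ioasid", O_RDWR);
int device_fd = open("/dev/vfio/device0", O_RDWR);

ioctl(device_fd, JOIN_IOASID_FD, ioasid_fd);      /* group/security check happens here */

int gpa_id = ioctl(ioasid_fd, CREATE_IOASID, 0);  /* e.g. the GPA map */
int rid_id = ioctl(ioasid_fd, CREATE_IOASID, 0);  /* e.g. a vIOMMU controlled table */

ioctl(device_fd, ATTACH_IOASID, gpa_id);          /* like attaching to a container today */
/* ... later, move the same device to the other page table ... */
ioctl(device_fd, DETATCH_IOASID);
ioctl(device_fd, ATTACH_IOASID, rid_id);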
> > The security rule for isolation is that once a device is attached to a
> > /dev/ioasid fd then all other devices in that security group must be
> > attached to the same ioasid FD or left unused.
>
> Sounds like a group... Note also that if those other devices are not
> isolated from the user's device, the user could manipulate "unused"
> devices via DMA. So even unused devices should be within the same
> IOMMU context... thus attaching groups to IOMMU domains.
That is a very interesting point. So, say, in the classic PCI bus
world if I have a NIC and HD on my PCI bus and both are in the group,
I assign the NIC to a /dev/ioasid & VFIO then it is possible to use
the NIC to access the HD via DMA
And here you want a more explicit statement that the HD is at risk by
using the NIC?
Honestly, I'm not sure the current group FD is actually showing that
very strongly - though I get the point it is modeled in the sysfs and
kind of implicit in the API - we evolved things in a way where most
actual applications are taking in a PCI BDF from the user, not a group
reference. So the actual security impact seems lost on the user.
Along my sketch if we have:
ioctl(vifo_device_fd, JOIN_IOASID_FD, ioasifd)
[..]
ioctl(vfio_device, ATTACH_IOASID, gpa_ioasid_id) == ENOPERM
I would feel comfortable if the ATTACH_IOASID fails by default if all
devices in the group have not been joined to the same ioasidfd.
So in the NIC&HD example the application would need to do:
ioasid_fd = open("/dev/ioasid");
nic_device_fd = open("/dev/vfio/device0")
hd_device_fd = open("/dev/vfio/device1")
ioctl(nic_device_fd, JOIN_IOASID_FD, ioasifd)
ioctl(hd_device_fd, JOIN_IOASID_FD, ioasifd)
[..]
ioctl(nic_device_fd, ATTACH_IOASID, gpa_ioasid_id) == SUCCESS
Now the security relation is forced by the kernel to be very explicit.
However to keep current semantics, I'd suggest a flag on
JOIN_IOASID_FD called "IOASID_IMPLICIT_GROUP" which has the effect of
allowing the ATTACH_IOASID to succeed without the user having to
explicitly join all the group devices. This is analogous to the world
we have today of opening the VFIO group FD but only instantiating one
device FD.
In effect the ioasid FD becomes the group and the numbered IOASID's
inside the FD become the /dev/vfio/vfio objects - we don't end up with
fewer objects in the system, they just have different uAPI
presentations.
I'd envision applications like DPDK that are BDF-centric to use the
first API with some '--allow-insecure-vfio' flag to switch on
IOASID_IMPLICIT_GROUP. Maybe good applications would also print:
"Danger Will Robinson, these PCI BDFs [...] are also at risk"
when the switch is used, by parsing sysfs.
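In sketch form (the flag, the ioctl and the join struct are all
hypothetical):

struct ioasid_join join = {                        /* illustrative struct */
	.ioasid_fd = ioasid_fd,
	.flags     = allow_insecure_vfio ? IOASID_IMPLICIT_GROUP : 0,
};

/* Without the flag this fails until every device in the group has either
 * joined this ioasid_fd or has no kernel driver bound; the "at risk" BDF
 * list comes from /sys/bus/pci/devices/BDF/iommu_group/devices. */
ioctl(nic_device_fd, JOIN_IOASID_FD, &join);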
> > Thus /dev/ioasid also becomes the unit of security and the IOMMU
> > subsystem level becomes aware of and enforces the group security
> > rules. Userspace does not need to "see" the group
>
> What tools does userspace have to understand isolation of individual
> devices without groups?
I think we can continue to show all of this group information in sysfs
files, it just doesn't require application code to open a group FD.
This becomes relevant the more I think about it - eliminating the
group and container FD uAPI by directly creating the device FD also
sidesteps questions about how to model these objects in a /dev/ioasid
only world. We simply don't have them at all so the answer is pretty
easy.
Jason
On Thu, 22 Apr 2021 14:57:15 -0300
Jason Gunthorpe <[email protected]> wrote:
> > > The security rule for isolation is that once a device is attached to a
> > > /dev/ioasid fd then all other devices in that security group must be
> > > attached to the same ioasid FD or left unused.
> >
> > Sounds like a group... Note also that if those other devices are not
> > isolated from the user's device, the user could manipulate "unused"
> > devices via DMA. So even unused devices should be within the same
> > IOMMU context... thus attaching groups to IOMMU domains.
>
> That is a very interesting point. So, say, in the classic PCI bus
> world if I have a NIC and HD on my PCI bus and both are in the group,
> I assign the NIC to a /dev/ioasid & VFIO then it is possible to use
> the NIC to access the HD via DMA
>
> And here you want a more explicit statement that the HD is at risk by
> using the NIC?
If by "classic" you mean conventional PCI bus, then this is much worse
than simply "at risk". The IOMMU cannot differentiate devices behind a
PCIe-to-PCI bridge, so the moment you turn on the IOMMU context for the
NIC, the address space for your HBA is pulled out from under it. In
the vfio world, the NIC and HBA are grouped and managed together, the
user cannot change the IOMMU context of a group unless all of the
devices in the group are "viable", ie. they are released from any host
drivers.
> Honestly, I'm not sure the current group FD is actually showing that
> very strongly - though I get the point it is modeled in the sysfs and
> kind of implicit in the API - we evolved things in a way where most
> actual applications are taking in a PCI BDF from the user, not a group
> reference. So the actual security impact seems lost on the user.
vfio users are extremely aware of grouping, they understand the model,
if not always the reason for the grouping. You only need to look at
r/VFIO to find various lsgroup scripts and kernel patches to manipulate
grouping. The visibility to the user is valuable imo.
> Along my sketch if we have:
>
> ioctl(vifo_device_fd, JOIN_IOASID_FD, ioasifd)
> [..]
> ioctl(vfio_device, ATTACH_IOASID, gpa_ioasid_id) == ENOPERM
>
> I would feel comfortable if the ATTACH_IOASID fails by default if all
> devices in the group have not been joined to the same ioasidfd.
And without a group representation to userspace, how would a user know
to resolve that?
> So in the NIC&HD example the application would need to do:
>
> ioasid_fd = open("/dev/ioasid");
> nic_device_fd = open("/dev/vfio/device0")
> hd_device_fd = open("/dev/vfio/device1")
>
> ioctl(nic_device_fd, JOIN_IOASID_FD, ioasifd)
> ioctl(hd_device_fd, JOIN_IOASID_FD, ioasifd)
> [..]
> ioctl(nice_device, ATTACH_IOASID, gpa_ioasid_id) == SUCCESS
>
> Now the security relation is forced by the kernel to be very explicit.
But not discoverable to the user.
> However to keep current semantics, I'd suggest a flag on
> JOIN_IOASID_FD called "IOASID_IMPLICIT_GROUP" which has the effect of
> allowing the ATTACH_IOASID to succeed without the user having to
> explicitly join all the group devices. This is analogous to the world
> we have today of opening the VFIO group FD but only instantiating one
> device FD.
>
> In effect the ioasid FD becomes the group and the numbered IOASID's
> inside the FD become the /dev/vfio/vfio objects - we don't end up with
> fewer objects in the system, they just have different uAPI
> presentations.
>
> I'd envision applications like DPDK that are BDF centric to use the
> first API with some '--allow-insecure-vfio' flag to switch on the
> IOASID_IMPLICIT_GROUP. Maybe good applications would also print:
> "Danger Will Robinson these PCI BDFs [...] are also at risk"
> When the switch is used by parsing the sysfs
So the groups still exist in sysfs, they just don't have vfio
representations? An implicit grouping does what, automatically unbind
the devices, so an admin gives a user access to the NIC but their HBA
device disappears because they were implicitly linked? That's why vfio
bases ownership on the group: if a user owns the group but the group is
not viable because a device is still bound to another kernel driver,
the user can't do anything. Implicitly snarfing up subtly affected
devices is bad.
> > > Thus /dev/ioasid also becomes the unit of security and the IOMMU
> > > subsystem level becomes aware of and enforces the group security
> > > rules. Userspace does not need to "see" the group
> >
> > What tools does userspace have to understand isolation of individual
> > devices without groups?
>
> I think we can continue to show all of this group information in sysfs
> files, it just doesn't require application code to open a group FD.
>
> This becomes relavent the more I think about it - elmininating the
> group and container FD uAPI by directly creating the device FD also
> sidesteps questions about how to model these objects in a /dev/ioasid
> only world. We simply don't have them at all so the answer is pretty
> easy.
I'm not sold. Ideally each device would be fully isolated, then we
could assume a 1:1 relation of group and device and collapse the model
to work on devices. We don't live in that world and I see a benefit to
making that explicit in the uapi, even if that group fd might seem
superfluous at times. Thanks,
Alex
On Thu, Apr 22, 2021 at 01:37:47PM -0600, Alex Williamson wrote:
> If by "classic" you mean conventional PCI bus, then this is much worse
> than simply "at risk". The IOMMU cannot differentiate devices behind a
> PCIe-to-PCI bridge, so the moment you turn on the IOMMU context for the
> NIC, the address space for your HBA is pulled out from under it.
Yes, I understand this, but this is fine and not really surprising if
the HD device is just forced to remain "unused".
To my mind the bigger issue is the NIC now has access to the HD and
nobody really raised an alarm unless the HD happened to have a kernel
driver bound.
> the vfio world, the NIC and HBA are grouped and managed together, the
> user cannot change the IOMMU context of a group unless all of the
> devices in the group are "viable", ie. they are released from any host
> drivers.
Yes, I don't propose to change any of that, I just suggest making the
"change the IOMMU context" step into "join a /dev/ioasid fd".
All devices in the group have to be joined to the same ioasid or, with
the flag, left "unused" with no kernel driver.
This is the same viability test VFIO is doing now, just moved slightly
in the programming flow.
> vfio users are extremely aware of grouping, they understand the model,
> if not always the reason for the grouping. You only need to look at
> r/VFIO to find various lsgroup scripts and kernel patches to manipulate
> grouping. The visibility to the user is valuable imo.
I don't propose to remove visibility, sysfs and the lsgroup scripts
should all still be there.
I'm just acknowledging the reality that the user command-line experience
we have is focused on single BDFs, not on groups. The user only sees
the group idea when things explode, so why do we have it as such an
integral part of the programming model?
> > ioctl(vifo_device_fd, JOIN_IOASID_FD, ioasifd)
> > [..]
> > ioctl(vfio_device, ATTACH_IOASID, gpa_ioasid_id) == ENOPERM
> >
> > I would feel comfortable if the ATTACH_IOASID fails by default if all
> > devices in the group have not been joined to the same ioasidfd.
>
> And without a group representation to userspace, how would a user know
> to resolve that?
Userspace can continue to read sysfs files that show the group
relation.
I'm only talking about the group char device and FD.
> So the group still exist in sysfs, they just don't have vfio
> representations? An implicit grouping does what, automatically unbind
> the devices, so an admin gives a user access to the NIC but their HBA
> device disappears because they were implicitly linked?
It does exactly the same thing as opening the VFIO group FD and
instantiating a single device FD does today.
Most software, like dpdk, automatically deduces the VFIO group from
the commandline BDF, I'm mainly suggesting we move that deduction from
userspace software to kernel software.
> basis ownership on the group, if a user owns the group but the group is
> not viable because a device is still bound to another kernel driver,
> the use can't do anything. Implicitly snarfing up subtly affected
> devices is bad.
The user would get a /dev/ioasid join failure just like they get a
failure from VFIO_GROUP_SET_CONTAINER (?) today that reflects the
group is not viable.
Otherwise what is the alternative?
How do we model the VFIO group security concept to something like
VDPA?
How do you reconcile the ioasid security model with the VFIO container
and group FDs?
Jason
On Thu, 22 Apr 2021 17:00:24 -0300
Jason Gunthorpe <[email protected]> wrote:
> On Thu, Apr 22, 2021 at 01:37:47PM -0600, Alex Williamson wrote:
>
> > If by "classic" you mean conventional PCI bus, then this is much worse
> > than simply "at risk". The IOMMU cannot differentiate devices behind a
> > PCIe-to-PCI bridge, so the moment you turn on the IOMMU context for the
> > NIC, the address space for your HBA is pulled out from under it.
>
> Yes, I understand this, but this is fine and not really surprising if
> the HD device is just forced to remain "unusued"
>
> To my mind the bigger issue is the NIC now has access to the HD and
> nobody really raised an alarm unless the HD happened to have a kernel
> driver bound.
>
> > the vfio world, the NIC and HBA are grouped and managed together, the
> > user cannot change the IOMMU context of a group unless all of the
> > devices in the group are "viable", ie. they are released from any host
> > drivers.
>
> Yes, I don't propose to change any of that, I just suggest to make the
> 'change the IOMMU context" into "join a /dev/ioasid fd"
>
> All devices in the group have to be joined to the same ioasid or, with
> the flag, left "unused" with no kernel driver.
>
> This is the same viability test VFIO is doing now, just moved slightly
> in the programming flow.
>
> > vfio users are extremely aware of grouping, they understand the model,
> > if not always the reason for the grouping. You only need to look at
> > r/VFIO to find various lsgroup scripts and kernel patches to manipulate
> > grouping. The visibility to the user is valuable imo.
>
> I don't propose to remove visibility, sysfs and the lsgroup scripts
> should all still be there.
>
> I'm just acknowledging reality that the user command line experiance
> we have is focused on single BDFs not on groups. The user only sees
> the group idea when things explode, so why do we have it as such an
> integral part of the programming model?
Because it's fundamental to the isolation of the device? What you're
proposing doesn't get around the group issue, it just makes it implicit
rather than explicit in the uapi. For what? Some ideal notion that
every device should be isolated at the expense of userspace drivers
that then fail randomly because they didn't take into account groups
because it's not part of the uapi?
> > > ioctl(vifo_device_fd, JOIN_IOASID_FD, ioasifd)
> > > [..]
> > > ioctl(vfio_device, ATTACH_IOASID, gpa_ioasid_id) == ENOPERM
> > >
> > > I would feel comfortable if the ATTACH_IOASID fails by default if all
> > > devices in the group have not been joined to the same ioasidfd.
> >
> > And without a group representation to userspace, how would a user know
> > to resolve that?
>
> Userspace can continue to read sysfs files that show the group
> relation.
>
> I'm only talking about the group char device and FD.
>
> > So the group still exist in sysfs, they just don't have vfio
> > representations? An implicit grouping does what, automatically unbind
> > the devices, so an admin gives a user access to the NIC but their HBA
> > device disappears because they were implicitly linked?
>
> It does exactly the same thing as opening the VFIO group FD and
> instantiating a single device FD does today.
>
> Most software, like dpdk, automatically deduces the VFIO group from
> the commandline BDF, I'm mainly suggesting we move that deduction from
> userspace software to kernel software.
>
> > basis ownership on the group, if a user owns the group but the group is
> > not viable because a device is still bound to another kernel driver,
> > the use can't do anything. Implicitly snarfing up subtly affected
> > devices is bad.
>
> The user would get an /dev/ioasid join failure just like they get a
> failure from VFIO_GROUP_SET_CONTAINER (?) today that reflects the
> group is not viable.
>
> Otherwise what is the alternative?
>
> How do we model the VFIO group security concept to something like
> VDPA?
Is it really a "VFIO group security concept"? We're reflecting the
reality of the hardware, not all devices are fully isolated. An IOMMU
group is the smallest set of devices we believe are isolated from all
other sets of devices. VFIO groups simply reflect that notion of an
IOMMU group. This is the reality that any userspace driver needs to
play in, it doesn't magically go away because we drop the group file
descriptor. It only makes the uapi more difficult to use correctly
because userspace drivers need to go outside of the uapi to have any
idea that this restriction exists. Thanks,
Alex
On Thu, Apr 22, 2021 at 04:38:08PM -0600, Alex Williamson wrote:
> Because it's fundamental to the isolation of the device? What you're
> proposing doesn't get around the group issue, it just makes it implicit
> rather than explicit in the uapi.
I'm not even sure it makes it explicit or implicit, it just takes away
the FD.
There are four group IOCTLs, I see them mapping to /dev/ioasid as follows:
VFIO_GROUP_GET_STATUS -
+ VFIO_GROUP_FLAGS_CONTAINER_SET is fairly redundant
+ VFIO_GROUP_FLAGS_VIABLE could be in a new sysfs under
kernel/iommu_groups, or could be an IOCTL on /dev/ioasid
IOASID_ALL_DEVICES_VIABLE
VFIO_GROUP_SET_CONTAINER -
+ This happens implicitly when the device joins the IOASID
so it gets moved to the vfio_device FD:
ioctl(vifo_device_fd, JOIN_IOASID_FD, ioasifd)
VFIO_GROUP_UNSET_CONTAINER -
+ Also moved to the vfio_device FD, opposite of JOIN_IOASID_FD
VFIO_GROUP_GET_DEVICE_FD -
+ Replaced by opening /dev/vfio/deviceX
Learn the deviceX which will be the cdev sysfs shows as:
/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/vfio/deviceX/dev
Open /dev/vfio/deviceX
> > How do we model the VFIO group security concept to something like
> > VDPA?
>
> Is it really a "VFIO group security concept"? We're reflecting the
> reality of the hardware, not all devices are fully isolated.
Well, exactly.
/dev/ioasid should understand the group concept somehow, otherwise it
is incomplete and maybe even security broken.
So, how do I add groups to, say, VDPA in a way that makes sense? The
only answer I come to is broadly what I outlined here - make
/dev/ioasid do all the group operations, and do them when we enjoin
the VDPA device to the ioasid.
Once I have solved all the groups problems with the non-VFIO users,
then where does that leave VFIO? Why does VFIO need a group FD if
everyone else doesn't?
> IOMMU group. This is the reality that any userspace driver needs to
> play in, it doesn't magically go away because we drop the group file
> descriptor.
I'm not saying it does, I'm saying it makes the uAPI more regular and
easier to fit into /dev/ioasid without the group FD.
> It only makes the uapi more difficult to use correctly because
> userspace drivers need to go outside of the uapi to have any idea
> that this restriction exists.
I don't think it makes any substantive difference one way or the
other.
With the group FD: the userspace has to read sysfs, find the list of
devices in the group, open the group fd, create device FDs for each
device using the name from sysfs.
Starting from a BDF the general pseudo code is
group_path = readlink("/sys/bus/pci/devices/BDF/iommu_group")
group_name = basename(group_path)
group_fd = open("/dev/vfio/"+group_name)
device_fd = ioctl(VFIO_GROUP_GET_DEVICE_FD, BDF);
Without the group FD: the userspace has to read sysfs, find the list
of devices in the group and then open the device-specific cdev (found
via sysfs) and link them to a /dev/ioasid FD.
Starting from a BDF the general pseudo code is:
device_name = first_directory_of("/sys/bus/pci/devices/BDF/vfio/")
device_fd = open("/dev/vfio/"+device_name)
ioasidfd = open("/dev/ioasid")
ioctl(device_fd, JOIN_IOASID_FD, ioasidfd)
These two routes can have identical outcomes and identical security
checks.
In both cases if userspace wants a list of BDFs in the same group as
the BDF it is interested in:
readdir("/sys/bus/pci/devices/BDF/iommu_group/devices")
It seems like a very small difference to me.
I still don't see how the group restriction gets surfaced to the
application through the group FD. The applications I looked through
just treat the group FD as a step on their way to get the device_fd.
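Slightly fleshed out, the second route could look like this
(JOIN_IOASID_FD and the per-device vfio cdev are still the hypothetical
pieces from this thread; error handling omitted):

#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>

int open_device_for_bdf(const char *bdf, int ioasid_fd)
{
	char path[256];
	struct dirent *de;

	/* the single entry under .../vfio/ names the device cdev */
	snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/vfio", bdf);
	DIR *dir = opendir(path);
	while ((de = readdir(dir)) && de->d_name[0] == '.')
		;                                 /* skip "." and ".." */
	snprintf(path, sizeof(path), "/dev/vfio/%s", de->d_name);
	closedir(dir);

	int device_fd = open(path, O_RDWR);
	ioctl(device_fd, JOIN_IOASID_FD, ioasid_fd);   /* hypothetical ioctl */
	return device_fd;
}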
Jason
> From: Jason Gunthorpe <[email protected]>
> Sent: Thursday, April 22, 2021 8:10 PM
>
> On Thu, Apr 22, 2021 at 08:34:32AM +0000, Tian, Kevin wrote:
>
> > Another tricky thing is that a container may be linked to multiple iommu
> > domains in VFIO, as devices in the container may locate behind different
> > IOMMUs with inconsistent capability (commit 1ef3e2bc).
>
> Frankly this sounds over complicated. I would think /dev/ioasid should
> select the IOMMU when the first device is joined, and all future joins
> must be compatible with the original IOMMU - ie there is only one set
> of IOMMU capabilities in a /dev/ioasid.
Or could we still have just one /dev/ioasid but allow userspace to create
multiple gpa_ioasid_id's each associated to a different iommu domain?
Then the compatibility check will be done at ATTACH_IOASID instead of
JOIN_IOASID_FD.
This does impose one burden on userspace though, to understand the
IOMMU compatibilities and figure out which incompatible features may
affect the page table management (while such knowledge is IOMMU
vendor specific) and then explicitly manage multiple /dev/ioasid's or
multiple gpa_ioasid_id's.
Alternatively, would it be a good design to have the kernel return an
error at attach/join time when an incompatibility is detected, so that
userspace then opens a new /dev/ioasid or creates a new gpa_ioasid_id
for the failing device upon such failure, without constructing its own
compatibility knowledge?
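e.g. something like (ioctl names as in the earlier sketches, the errno
value is only illustrative):

int id = gpa_ioasid_id;

if (ioctl(device_fd, ATTACH_IOASID, id) < 0 && errno == EINVAL) {
	/* this device sits behind an incompatible IOMMU: give it its own IOASID */
	id = ioctl(ioasid_fd, CREATE_IOASID, 0);
	ioctl(ioasid_fd, SET_IOASID_PAGE_TABLES, id);
	ioctl(device_fd, ATTACH_IOASID, id);
}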
>
> This means qemue might have multiple /dev/ioasid's if the system has
> multiple incompatible IOMMUs (is this actually a thing?) The platform
One example is an Intel platform with igd. Typically there is one IOMMU
dedicated to the igd and another IOMMU serving all the remaining devices.
The igd IOMMU may not support IOMMU_CACHE while the other one
does.
> should design its IOMMU domains to minimize the number of
> /dev/ioasid's required.
>
> Is there a reason we need to share IOASID'd between completely
> divergance IOMMU implementations? I don't expect the HW should be able
> to physically share page tables??
yes, e.g. in vSVA both devices (behind divergent IOMMUs) are bound
to a single guest process which has a unique PASID and 1st-level page
table. The earlier incompatibility example is only for the 2nd-level.
>
> That decision point alone might be the thing that just says we can't
> ever have /dev/vfio/vfio == /dev/ioasid
yes, unless we adopt the vfio scheme, i.e. implicitly managing incompatible
iommu domains in /dev/ioasid.
>
> > Just to confirm. Above flow is for current map/unmap flavor as what
> > VFIO/vDPA do today. Later when nested translation is supported,
> > there is no need to detach gpa_ioasid_fd. Instead, a new cmd will
> > be introduced to nest rid_ioasid_fd on top of gpa_ioasid_fd:
>
> Sure.. The tricky bit will be to define both of the common nested
> operating modes.
>
> nested_ioasid = ioctl(ioasid_fd, CREATE_NESTED_IOASID, gpa_ioasid_id);
> ioctl(ioasid_fd, SET_NESTED_IOASID_PAGE_TABLES, nested_ioasid, ..)
>
> // IOMMU will match on the device RID, no PASID:
> ioctl(vfio_device, ATTACH_IOASID, nested_ioasid);
>
> // IOMMU will match on the device RID and PASID:
> ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
I'm a bit confused here why we have both pasid and ioasid notations together.
Why not use nested_ioasid as pasid directly (i.e. every pasid in nested mode
is created by CREATE_NESTED_IOASID)?
Below I list different scenarios for ATTACH_IOASID in my view. Here
vfio_device could be a real PCI function (RID), or a subfunction device
(RID+def_ioasid). The vfio_device could be attached to a gpa_ioasid
(RID in guest view, no nesting), a nested_ioasid (RID in guest view, nesting)
or a nested_ioasid (RID+PASID in guest view, nesting).
// IOMMU will match on the device RID, no nesting, no PASID:
ioctl(vfio_device, ATTACH_IOASID, gpa_ioasid);
// IOMMU will match on the device (RID+def_ioasid), no nesting, no PASID:
ioctl(vfio_subdevice, ATTACH_IOASID, gpa_ioasid);
// IOMMU will match on the device RID, nesting, no PASID:
ioctl(vfio_device, ATTACH_IOASID, nested_ioasid);
// IOMMU will match on the device (RID+def_ioasid), nesting, no PASID:
ioctl(vfio_subdevice, ATTACH_IOASID, nested_ioasid);
// IOMMU will match on the device (RID+nested_ioasid), nesting, PASID:
ioctl(vfio_device, ATTACH_IOASID_PASID, nested_ioasid);
// IOMMU will match on the device (RID+nested_ioasid), nesting, PASID:
ioctl(vfio_subdevice, ATTACH_IOASID_PASID, nested_ioasid);
>
> Notice that ATTACH (or bind, whatever) is always done on the
> vfio_device FD. ATTACH tells the IOMMU HW to link the PCI BDF&PASID to
> a specific page table defined by an IOASID.
>
> I expect we have many flavours of IOASID tables, eg we have normal,
> and 'nested with table controlled by hypervisor'. ARM has 'nested with
> table controlled by guest' right? So like this?
>
> nested_ioasid = ioctl(ioasid_fd, CREATE_DELGATED_IOASID,
> gpa_ioasid_id, <some kind of viommu_id>)
> // PASID now goes to <viommu_id>
> ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
>
> Where <viommu_id> is some internal to the guest handle of the viommu
> page table scoped within gpa_ioasid_id? Like maybe it is GPA of the
> base of the page table?
>
> The guest can't select its own PASIDs without telling the hypervisor,
> right?
>
If the whole PASID table is delegated to the guest in the ARM case, the guest
can select its own PASIDs w/o telling the hypervisor. I haven't thought
carefully about a clean way to support this scheme, e.g. whether
mandating that the guest always allocate PASIDs through the hypervisor,
even in this case, would make the uAPI simpler...
Thanks
Kevin
> From: Jason Gunthorpe <[email protected]>
> Sent: Friday, April 23, 2021 7:40 AM
>
> On Thu, Apr 22, 2021 at 04:38:08PM -0600, Alex Williamson wrote:
>
> > Because it's fundamental to the isolation of the device? What you're
> > proposing doesn't get around the group issue, it just makes it implicit
> > rather than explicit in the uapi.
>
> I'm not even sure it makes it explicit or implicit, it just takes away
> the FD.
>
> There are four group IOCTLs, I see them mapping to /dev/ioasid follows:
> VFIO_GROUP_GET_STATUS -
> + VFIO_GROUP_FLAGS_CONTAINER_SET is fairly redundant
> + VFIO_GROUP_FLAGS_VIABLE could be in a new sysfs under
> kernel/iomm_groups, or could be an IOCTL on /dev/ioasid
> IOASID_ALL_DEVICES_VIABLE
>
> VFIO_GROUP_SET_CONTAINER -
> + This happens implicitly when the device joins the IOASID
> so it gets moved to the vfio_device FD:
> ioctl(vifo_device_fd, JOIN_IOASID_FD, ioasifd)
>
> VFIO_GROUP_UNSET_CONTAINER -
> + Also moved to the vfio_device FD, opposite of JOIN_IOASID_FD
>
> VFIO_GROUP_GET_DEVICE_FD -
> + Replaced by opening /dev/vfio/deviceX
> Learn the deviceX which will be the cdev sysfs shows as:
> /sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/vfio/deviceX/dev
> Open /dev/vfio/deviceX
>
> > > How do we model the VFIO group security concept to something like
> > > VDPA?
> >
> > Is it really a "VFIO group security concept"? We're reflecting the
> > reality of the hardware, not all devices are fully isolated.
>
> Well, exactly.
>
> /dev/ioasid should understand the group concept somehow, otherwise it
> is incomplete and maybe even security broken.
>
> So, how do I add groups to, say, VDPA in a way that makes sense? The
> only answer I come to is broadly what I outlined here - make
> /dev/ioasid do all the group operations, and do them when we enjoin
> the VDPA device to the ioasid.
>
> Once I have solved all the groups problems with the non-VFIO users,
> then where does that leave VFIO? Why does VFIO need a group FD if
> everyone else doesn't?
>
> > IOMMU group. This is the reality that any userspace driver needs to
> > play in, it doesn't magically go away because we drop the group file
> > descriptor.
>
> I'm not saying it does, I'm saying it makes the uAPI more regular and
> easier to fit into /dev/ioasid without the group FD.
>
> > It only makes the uapi more difficult to use correctly because
> > userspace drivers need to go outside of the uapi to have any idea
> > that this restriction exists.
>
> I don't think it makes any substantive difference one way or the
> other.
>
> With the group FD: the userspace has to read sysfs, find the list of
> devices in the group, open the group fd, create device FDs for each
> device using the name from sysfs.
>
> Starting from a BDF the general pseudo code is
> group_path = readlink("/sys/bus/pci/devices/BDF/iommu_group")
> group_name = basename(group_path)
> group_fd = open("/dev/vfio/"+group_name)
> device_fd = ioctl(VFIO_GROUP_GET_DEVICE_FD, BDF);
>
> Without the group FD: the userspace has to read sysfs, find the list
> of devices in the group and then open the device-specific cdev (found
> via sysfs) and link them to a /dev/ioasid FD.
>
> Starting from a BDF the general pseudo code is:
> device_name = first_directory_of("/sys/bus/pci/devices/BDF/vfio/")
> device_fd = open("/dev/vfio/"+device_name)
> ioasidfd = open("/dev/ioasid")
> ioctl(device_fd, JOIN_IOASID_FD, ioasidfd)
>
> These two routes can have identical outcomes and identical security
> checks.
>
> In both cases if userspace wants a list of BDFs in the same group as
> the BDF it is interested in:
> readdir("/sys/bus/pci/devices/BDF/iommu_group/devices")
>
> It seems like a very small difference to me.
>
> I still don't see how the group restriction gets surfaced to the
> application through the group FD. The applications I looked through
> just treat the group FD as a step on their way to get the device_fd.
>
So your proposal sort of moves the entire container/group/domain
management into /dev/ioasid and then leaves vfio to provide only the
device-specific uAPI. An ioasid represents a page table (address space), thus
is equivalent to the scope of VFIO container. Having the device join
an ioasid is equivalent to attaching a device to VFIO container, and
here the group integrity must be enforced. Then /dev/ioasid anyway
needs to manage group objects and their association with ioasid and
underlying iommu domain thus it's pointless to keep same logic within
VFIO. Is this understanding correct?
btw one remaining open is whether you expect /dev/ioasid to be
associated with a single iommu domain, or multiple. If only a single
domain is allowed, the ioasid_fd is equivalent to the scope of VFIO
container. It is supposed to have only one gpa_ioasid_id since one
iommu domain can only have a single 2nd level pgtable. Then all other
ioasids, once allocated, must be nested on this gpa_ioasid_id to fit
in the same domain. If a legacy vIOMMU is exposed (which disallows
nesting), the userspace has to open an ioasid_fd for every group.
This is basically the VFIO way. On the other hand, if multiple domains
are allowed, there could be multiple ioasid_ids each holding a 2nd level
pgtable and an iommu domain (or a list of pgtables and domains due to
incompatibility issue as discussed in another thread), and can be
nested by other ioasids respectively. The application only needs
to open /dev/ioasid once regardless of whether vIOMMU allows
nesting, and has a single interface for ioasid allocation. Which way
do you prefer?
Thanks
Kevin
On Fri, Apr 23, 2021 at 09:06:44AM +0000, Tian, Kevin wrote:
> Or could we still have just one /dev/ioasid but allow userspace to create
> multiple gpa_ioasid_id's each associated to a different iommu domain?
> Then the compatibility check will be done at ATTACH_IOASID instead of
> JOIN_IOASID_FD.
To my mind what makes sense is that /dev/ioasid presents a single
IOMMU behavior that is basically the same. This may ultimately not be
what we call a domain today.
We may end up with a middle object which is a group of domains that
all have the same capabilities, and we define capabilities in a way
that most platforms have a single group of domains.
The key capability of a group of domains is they can all share the HW
page table representation, so if an IOASID instantiates a page table
it can be assigned to any device on any domain in the group of domains.
If you try to say that /dev/ioasid has many domains and they can't
have their HW page tables shared then I think the implementation
complexity will explode.
> This does impose one burden to userspace though, to understand the
> IOMMU compatibilities and figure out which incompatible features may
> affect the page table management (while such knowledge is IOMMU
> vendor specific) and then explicitly manage multiple /dev/ioasid's or
> multiple gpa_ioasid_id's.
Right, this seems very hard in the general case..
> Alternatively is it a good design by having the kernel return error at
> attach/join time to indicate that incompatibility is detected then the
> userspace should open a new /dev/ioasid or creates a new gpa_ioasid_id
> for the failing device upon such failure, w/o constructing its own
> compatibility knowledge?
Yes, this feels workable too
> > This means qemue might have multiple /dev/ioasid's if the system has
> > multiple incompatible IOMMUs (is this actually a thing?) The platform
>
> One example is Intel platform with igd. Typically there is one IOMMU
> dedicated for igd and the other IOMMU serving all the remaining devices.
> The igd IOMMU may not support IOMMU_CACHE while the other one
> does.
If we can do as above, the two domains may be in the same group of
domains and IOMMU_CACHE is not exposed at the /dev/ioasid level.
For instance the API could specify IOMMU_CACHE during attach, not
during IOASID creation.
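Illustratively (the struct and flag names are made up):

struct ioasid_attach { int ioasid; unsigned int flags; };   /* illustrative */

struct ioasid_attach args = { .ioasid = gpa_ioasid_id, .flags = 0 };
ioctl(igd_device_fd, ATTACH_IOASID, &args);        /* igd IOMMU: no snooping */

args.flags = ATTACH_IOMMU_CACHE;                   /* made-up flag */
ioctl(other_device_fd, ATTACH_IOASID, &args);      /* the coherent IOMMU */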
Getting all the data model right in the API is going to be the
trickiest part of this.
> yes, e.g. in vSVA both devices (behind divergence IOMMUs) are bound
> to a single guest process which has an unique PASID and 1st-level page
> table. Earlier incompatibility example is only for 2nd-level.
Because when we get to here, things become inscrutable as an API if
you are trying to say two different IOMMU presentations can actually
be nested.
> > Sure.. The tricky bit will be to define both of the common nested
> > operating modes.
> >
> > nested_ioasid = ioctl(ioasid_fd, CREATE_NESTED_IOASID, gpa_ioasid_id);
> > ioctl(ioasid_fd, SET_NESTED_IOASID_PAGE_TABLES, nested_ioasid, ..)
> >
> > // IOMMU will match on the device RID, no PASID:
> > ioctl(vfio_device, ATTACH_IOASID, nested_ioasid);
> >
> > // IOMMU will match on the device RID and PASID:
> > ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
>
> I'm a bit confused here why we have both pasid and ioasid notations together.
> Why not use nested_ioasid as pasid directly (i.e. every pasid in nested mode
> is created by CREATE_NESTED_IOASID)?
The IOASID is not a PASID, it is just a page table.
A generic IOMMU matches on either RID or (RID,PASID), so you should
specify the PASID when establishing the match.
IOASID only specifies the page table.
So you read the above as configuring the path
PCI_DEVICE -> (RID,PASID) -> nested_ioasid -> gpa_ioasid_id -> physical
Where (RID,PASID) indicate values taken from the PCI packet.
In principle the IOMMU could also be commanded to reuse the same
ioasid page table with a different PASID:
PCI_DEVICE_B -> (RID_B,PASID_B) -> nested_ioasid -> gpa_ioasid_id -> physical
This is impossible if the ioasid == PASID in the API.
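In the sketch's notation that reuse would simply be (hypothetical ioctls
as before):

ioctl(vfio_device_a, ATTACH_IOASID_PASID, pasid_a, nested_ioasid);
ioctl(vfio_device_b, ATTACH_IOASID_PASID, pasid_b, nested_ioasid);
/* both devices walk the same nested page table, but the IOMMU matches
 * them on different PASIDs */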
> Below I list different scenarios for ATTACH_IOASID in my view. Here
> vfio_device could be a real PCI function (RID), or a subfunction device
> (RID+def_ioasid).
What is RID+def_ioasid? The IOMMU does not match on IOASIDs.
A subfunction device always needs to use a PASID, or an internal IOMMU;
I'm confused about what you are trying to explain.
> If the whole PASID table is delegated to the guest in ARM case, the guest
> can select its own PASIDs w/o telling the hypervisor.
The hypervisor has to route the PASID's to the guest at some point - a
guest can't just claim a PASID unilaterally, that would not be secure.
If it is not done with per-PASID hypercalls then the hypervisor has to
route all PASID's for a RID to the guest and /dev/ioasid needs to have
a nested IOASID object that represents this connection - ie it points
to the PASID table of the guest vIOMMU or something.
Remember this all has to be compatible with mdev's too and without
hypercalls to create PASIDs that will be hard: mdev sharing a RID and
slicing the physical PASIDs can't support a 'send all PASIDs to the
guest' model, or even a 'the guest gets to pick the PASID' option.
Jason
On Fri, Apr 23, 2021 at 10:31:46AM +0000, Tian, Kevin wrote:
> So your proposal sort of moves the entire container/group/domain
> managment into /dev/ioasid and then leaves vfio only provide device
> specific uAPI. An ioasid represents a page table (address space), thus
> is equivalent to the scope of VFIO container. Having the device join
> an ioasid is equivalent to attaching a device to VFIO container, and
> here the group integrity must be enforced. Then /dev/ioasid anyway
> needs to manage group objects and their association with ioasid and
> underlying iommu domain thus it's pointless to keep same logic within
> VFIO. Is this understanding correct?
Yes, I haven't thought of a way to define /dev/ioasid in a way that is
useful to VDPA/etc without all these parts.. If you come up with a
better idea do share.
> btw one remaining open is whether you expect /dev/ioasid to be
> associated with a single iommu domain, or multiple. If only a single
> domain is allowed, the ioasid_fd is equivalent to the scope of VFIO
> container.
See the prior email for a more complete set of thoughts on this.
> It is supposed to have only one gpa_ioasid_id since one iommu domain
> can only have a single 2nd level pgtable. Then all other ioasids,
> once allocated, must be nested on this gpa_ioasid_id to fit in the
> same domain. if a legacy vIOMMU is exposed (which disallows
> nesting), the userspace has to open an ioasid_fd for every group.
> This is basically the VFIO way. On the other hand if multiple
> domains is allowed, there could be multiple ioasid_ids each holding
> a 2nd level pgtable and an iommu domain (or a list of pgtables and
> domains due to incompatibility issue as discussed in another
> thread), and can be nested by other ioasids respectively. The
> application only needs to open /dev/ioasid once regardless of
> whether vIOMMU allows nesting, and has a single interface for ioasid
> allocation. Which way do you prefer to?
I have a feeling we want to have a single IOASID be usable in as many
contexts as possible - as many domains, devices and groups as we can
get away with.
The IOASID is the expensive object, it is the pagetable, it is
potentially a lot of memory. The API should be designed so we don't
have to have multiple copies of the same pagetable.
For this reason I think having multiple IOASID's in a single
/dev/ioasid container is the way to go.
To my mind the /dev/ioasid should be linked to a HW page table format
and any device/domain/group that uses that same HW page table format
can be joined to it. This implies we can have multiple domains under
/dev/ioasid, but there is a limitation on what domains can be grouped
together.
This probably does not match the exact IOMMU capability/domain model
we have today, so I present it as an inspirational goal. The other
tricky thing here will be to define small steps..
eg V1 may only allow one domain, but the uAPI will not reflect this as
we expect V2 will allow multiple domains..
Jason
On Thu, 22 Apr 2021 20:39:50 -0300
Jason Gunthorpe <[email protected]> wrote:
> On Thu, Apr 22, 2021 at 04:38:08PM -0600, Alex Williamson wrote:
>
> > Because it's fundamental to the isolation of the device? What you're
> > proposing doesn't get around the group issue, it just makes it implicit
> > rather than explicit in the uapi.
>
> I'm not even sure it makes it explicit or implicit, it just takes away
> the FD.
>
> There are four group IOCTLs, I see them mapping to /dev/ioasid follows:
> VFIO_GROUP_GET_STATUS -
> + VFIO_GROUP_FLAGS_CONTAINER_SET is fairly redundant
> + VFIO_GROUP_FLAGS_VIABLE could be in a new sysfs under
> kernel/iomm_groups, or could be an IOCTL on /dev/ioasid
> IOASID_ALL_DEVICES_VIABLE
>
> VFIO_GROUP_SET_CONTAINER -
> + This happens implicitly when the device joins the IOASID
> so it gets moved to the vfio_device FD:
> ioctl(vifo_device_fd, JOIN_IOASID_FD, ioasifd)
>
> VFIO_GROUP_UNSET_CONTAINER -
> + Also moved to the vfio_device FD, opposite of JOIN_IOASID_FD
>
> VFIO_GROUP_GET_DEVICE_FD -
> + Replaced by opening /dev/vfio/deviceX
> Learn the deviceX which will be the cdev sysfs shows as:
> /sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/vfio/deviceX/dev
> Open /dev/vfio/deviceX
>
> > > How do we model the VFIO group security concept to something like
> > > VDPA?
> >
> > Is it really a "VFIO group security concept"? We're reflecting the
> > reality of the hardware, not all devices are fully isolated.
>
> Well, exactly.
>
> /dev/ioasid should understand the group concept somehow, otherwise it
> is incomplete and maybe even security broken.
>
> So, how do I add groups to, say, VDPA in a way that makes sense? The
> only answer I come to is broadly what I outlined here - make
> /dev/ioasid do all the group operations, and do them when we enjoin
> the VDPA device to the ioasid.
>
> Once I have solved all the groups problems with the non-VFIO users,
> then where does that leave VFIO? Why does VFIO need a group FD if
> everyone else doesn't?
This assumes there's a solution for vDPA that doesn't just ignore the
problem and hope for the best. I can't speak to a vDPA solution.
> > IOMMU group. This is the reality that any userspace driver needs to
> > play in, it doesn't magically go away because we drop the group file
> > descriptor.
>
> I'm not saying it does, I'm saying it makes the uAPI more regular and
> easier to fit into /dev/ioasid without the group FD.
>
> > It only makes the uapi more difficult to use correctly because
> > userspace drivers need to go outside of the uapi to have any idea
> > that this restriction exists.
>
> I don't think it makes any substantive difference one way or the
> other.
>
> With the group FD: the userspace has to read sysfs, find the list of
> devices in the group, open the group fd, create device FDs for each
> device using the name from sysfs.
>
> Starting from a BDF the general pseudo code is
> group_path = readlink("/sys/bus/pci/devices/BDF/iommu_group")
> group_name = basename(group_path)
> group_fd = open("/dev/vfio/"+group_name)
> device_fd = ioctl(VFIO_GROUP_GET_DEVICE_FD, BDF);
>
> Without the group FD: the userspace has to read sysfs, find the list
> of devices in the group and then open the device-specific cdev (found
> via sysfs) and link them to a /dev/ioasid FD.
>
> Starting from a BDF the general pseudo code is:
> device_name = first_directory_of("/sys/bus/pci/devices/BDF/vfio/")
> device_fd = open("/dev/vfio/"+device_name)
> ioasidfd = open("/dev/ioasid")
> ioctl(device_fd, JOIN_IOASID_FD, ioasidfd)
This is exactly the implicit vs explicit semantics. In the existing
vfio case, the user needs to explicitly interact with the group. In
your proposal, the user interacts with the device, the group concept is
an implicit restriction. You've listed a step in the description about
a "list of devices in the group", but nothing in the pseudo code
reflects that step. I expect it would be a subtlety missed by any
userspace driver developer unless they happen to work on a system where
the grouping is not ideal. I think that makes the interface hard to
use correctly.
> These two routes can have identical outcomes and identical security
> checks.
>
> In both cases if userspace wants a list of BDFs in the same group as
> the BDF it is interested in:
> readdir("/sys/bus/pci/devices/BDF/iommu_group/devices")
>
> It seems like a very small difference to me.
The difference is that the group becomes a nuance that I expect would
be ignored, rather than a first class concept in the API.
> I still don't see how the group restriction gets surfaced to the
> application through the group FD. The applications I looked through
> just treat the group FD as a step on their way to get the device_fd.
A step where the developer hopefully recognizes that there might be
other devices in a group, a group can't be opened more than once, the
group has a flag indicating viability, they can't actually get the
device fd until the group is attached to an IOMMU context, all devices
in the group therefore have the same IOMMU context, and device fd isn't
actually available until they've gone through a proper setup, which is
an additional layer of protection. A userspace vfio developer may not
understand why a group isn't viable, but they have that clue that it's
something at the group level to investigate.
Most of the userspace vfio drivers that I haven't contributed myself
have come about with little or no interaction from me, so I'd like to
think that the vfio uapi is actually somewhat intuitive in its concepts
and difficult to use incorrectly. Thanks,
Alex
On Fri, Apr 23, 2021 at 10:38:51AM -0600, Alex Williamson wrote:
> On Thu, 22 Apr 2021 20:39:50 -0300
> > /dev/ioasid should understand the group concept somehow, otherwise it
> > is incomplete and maybe even security broken.
> >
> > So, how do I add groups to, say, VDPA in a way that makes sense? The
> > only answer I come to is broadly what I outlined here - make
> > /dev/ioasid do all the group operations, and do them when we enjoin
> > the VDPA device to the ioasid.
> >
> > Once I have solved all the groups problems with the non-VFIO users,
> > then where does that leave VFIO? Why does VFIO need a group FD if
> > everyone else doesn't?
>
> This assumes there's a solution for vDPA that doesn't just ignore the
> problem and hope for the best. I can't speak to a vDPA solution.
I don't think we can just ignore the question and succeed with
/dev/ioasid.
I guess it should get answered as best it can for ioasid "in general",
then we can decide if it makes sense for VFIO to use the group FD or
not when working in ioasid mode.
Maybe a better idea will come up.
> an implicit restriction. You've listed a step in the description about
> a "list of devices in the group", but nothing in the pseudo code
> reflects that step.
I gave it below with the readdir() - it isn't in the pseudo code
because the applications I looked through didn't use it, and wouldn't
benefit from it. I tried to show what things were doing today.
> I expect it would be a subtlety missed by any userspace driver
> developer unless they happen to work on a system where the grouping
> is not ideal.
I'm still unclear - what would be the consequences if the application
designer misses the group detail?
Jason
> From: Jason Gunthorpe <[email protected]>
> Sent: Friday, April 23, 2021 7:50 PM
>
> On Fri, Apr 23, 2021 at 09:06:44AM +0000, Tian, Kevin wrote:
>
> > Or could we still have just one /dev/ioasid but allow userspace to create
> > multiple gpa_ioasid_id's each associated to a different iommu domain?
> > Then the compatibility check will be done at ATTACH_IOASID instead of
> > JOIN_IOASID_FD.
>
> To my mind what makes sense is that /dev/ioasid presents a single
> IOMMU behavior that is basically the same. This may ultimately not be
> what we call a domain today.
>
> We may end up with a middle object which is a group of domains that
> all have the same capabilities, and we define capabilities in a way
> that most platforms have a single group of domains.
>
> The key capability of a group of domains is they can all share the HW
> page table representation, so if an IOASID instantiates a page table
> it can be assigned to any device on any domain in the group of domains.
Sorry that I didn't quite get it. If a group of domains can share the
same page table then why not just attach all devices under those
domains into a single domain? IMO the iommu domain is introduced
to describe the HW page table. Ideally a new iommu domain should
be created only when it's impossible to share an existing page table.
Otherwise you'll get bad iotlb efficiency because each domain has its
unique domain id (tagged in iotlb) then duplicated iotlb entries may
exist even when a single page table is shared by those domains.
Then does it imply that you are actually suggesting /dev/ioasid associated
with a single 2nd-level page table (which can be nested by multiple
1st-level page tables represented by other ioasids) thus a single iommu
domain for all devices linked to compatible IOMMUs?
Or, can you elaborate what is the targeted usage by having a group of
domains which all share the same page table?
>
> If you try to say that /dev/ioasid has many domains and they can't
> have their HW page tables shared then I think the implementation
> complexity will explode.
Want to hear your opinion for one open here. There is no doubt that
an ioasid represents a HW page table when the table is constructed by
userspace and then linked to the IOMMU through the bind/unbind
API. But I'm not very sure about whether an ioasid should represent
the exact pgtable or the mapping metadata when the underlying
pgtable is indirectly constructed through map/unmap API. VFIO does
the latter way, which is why it allows multiple incompatible domains
in a single container which all share the same mapping metadata.
>
> > This does impose one burden to userspace though, to understand the
> > IOMMU compatibilities and figure out which incompatible features may
> > affect the page table management (while such knowledge is IOMMU
> > vendor specific) and then explicitly manage multiple /dev/ioasid's or
> > multiple gpa_ioasid_id's.
>
> Right, this seems very hard in the general case..
>
> > Alternatively is it a good design by having the kernel return error at
> > attach/join time to indicate that incompatibility is detected then the
> > userspace should open a new /dev/ioasid or creates a new gpa_ioasid_id
> > for the failing device upon such failure, w/o constructing its own
> > compatibility knowledge?
>
> Yes, this feels workable too
>
> > > This means qemue might have multiple /dev/ioasid's if the system has
> > > multiple incompatible IOMMUs (is this actually a thing?) The platform
> >
> > One example is Intel platform with igd. Typically there is one IOMMU
> > dedicated for igd and the other IOMMU serving all the remaining devices.
> > The igd IOMMU may not support IOMMU_CACHE while the other one
> > does.
>
> If we can do as above the two domains may be in the same group of
> domains and the IOMMU_CACHE is not exposed at the /dev/ioasid level.
>
> For instance the API could specify IOMMU_CACHE during attach, not
> during IOASID creation.
>
> Getting all the data model right in the API is going to be trickiest
> part of this.
>
> > yes, e.g. in vSVA both devices (behind divergent IOMMUs) are bound
> > to a single guest process which has an unique PASID and 1st-level page
> > table. Earlier incompatibility example is only for 2nd-level.
>
> Because when we get to here, things become inscrutable as an API if
> you are trying to say two different IOMMU presentations can actually
> be nested.
>
> > > Sure.. The tricky bit will be to define both of the common nested
> > > operating modes.
> > >
> > > nested_ioasid = ioctl(ioasid_fd, CREATE_NESTED_IOASID,
> gpa_ioasid_id);
> > > ioctl(ioasid_fd, SET_NESTED_IOASID_PAGE_TABLES, nested_ioasid, ..)
> > >
> > > // IOMMU will match on the device RID, no PASID:
> > > ioctl(vfio_device, ATTACH_IOASID, nested_ioasid);
> > >
> > > // IOMMU will match on the device RID and PASID:
> > > ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
> >
> > I'm a bit confused here why we have both pasid and ioasid notations
> together.
> > Why not use nested_ioasid as pasid directly (i.e. every pasid in nested
> mode
> > is created by CREATE_NESTED_IOASID)?
>
> The IOASID is not a PASID, it is just a page table.
>
> A generic IOMMU matches on either RID or (RID,PASID), so you should
> specify the PASID when establishing the match.
>
> IOASID only specifies the page table.
>
> So you read the above as configuring the path
>
> PCI_DEVICE -> (RID,PASID) -> nested_ioasid -> gpa_ioasid_id -> physical
>
> Where (RID,PASID) indicate values taken from the PCI packet.
>
> In principle the IOMMU could also be commanded to reuse the same
> ioasid page table with a different PASID:
>
> PCI_DEVICE_B -> (RID_B,PASID_B) -> nested_ioasid -> gpa_ioasid_id ->
> physical
>
> This is impossible if the ioasid == PASID in the API.
OK, now I see where the disconnect comes from. In my context ioasid
is the identifier that is actually used on the wire, but it seems you treat it as
a sw-defined namespace purely for representing page tables. We should
clear up this concept first before further discussing other details.
Below is the description when the kernel ioasid allocator was introduced:
--
commit fa83433c92e340822a056a610a4fa2063a3db304
Author: Jean-Philippe Brucker <[email protected]>
Date: Wed Oct 2 12:42:41 2019 -0700
iommu: Add I/O ASID allocator
Some devices might support multiple DMA address spaces, in particular
those that have the PCI PASID feature. PASID (Process Address Space ID)
allows to share process address spaces with devices (SVA), partition a
device into VM-assignable entities (VFIO mdev) or simply provide
multiple DMA address space to kernel drivers. Add a global PASID
allocator usable by different drivers at the same time. Name it I/O ASID
to avoid confusion with ASIDs allocated by arch code, which are usually
a separate ID space.
The IOASID space is global. Each device can have its own PASID space,
but by convention the IOMMU ended up having a global PASID space, so
that with SVA, each mm_struct is associated to a single PASID.
The allocator is primarily used by IOMMU subsystem but in rare occasions
drivers would like to allocate PASIDs for devices that aren't managed by
an IOMMU, using the same ID space as IOMMU.
--
ioasid and pasid are used interchangeably within the kernel, and the ioasid
value returned by the ioasid allocator is directly used as the PASID when the
driver programs the IOMMU and the device, respectively. My context is based on
this understanding, which is why I thought nested_ioasid could be directly
used as the PASID in my earlier reply. Do you see a problem with this approach?
Then following your proposal, does it mean that we need another interface
for allocating PASIDs? And since ioasid means a different thing in the uAPI and
the in-kernel API, possibly a new name is required to avoid confusion?
>
> > Below I list different scenarios for ATTACH_IOASID in my view. Here
> > vfio_device could be a real PCI function (RID), or a subfunction device
> > (RID+def_ioasid).
>
> What is RID+def_ioasid? The IOMMU does not match on IOASID's.
>
> A subfunction device always needs to use a PASID, or an internal IOMMU,
> so I'm confused about what you are trying to explain?
Here the def_ioasid is the PASID that is associated with the subfunction
in my context, per the explanation above.
>
> > If the whole PASID table is delegated to the guest in ARM case, the guest
> > can select its own PASIDs w/o telling the hypervisor.
>
> The hypervisor has to route the PASID's to the guest at some point - a
> guest can't just claim a PASID unilaterally, that would not be secure.
>
> If it is not done with per-PASID hypercalls then the hypervisor has to
> route all PASID's for a RID to the guest and /dev/ioasid needs to have
> a nested IOASID object that represents this connection - ie it points
> to the PASID table of the guest vIOMMU or something.
Yes, this might be the model that works for the ARM case. In their
architecture the PASID table lives in GPA space and is thus naturally
managed by the guest (though Jean once mentioned a tricky method to
allow the host to manage it by stealing a GPA window).
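In pseudo code such an object might look roughly like below (the ioctl
names are made up purely for illustration, nothing like this exists yet):

   // a nested IOASID that hands the whole per-RID PASID table to the guest
   pasidtbl_ioasid = ioctl(ioasid_fd, CREATE_NESTED_IOASID, gpa_ioasid_id);
   ioctl(ioasid_fd, SET_NESTED_IOASID_PASID_TABLE, pasidtbl_ioasid,
         guest_pasid_table_gpa);
   ioctl(vfio_device_fd, ATTACH_IOASID, pasidtbl_ioasid);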
>
> Remember this all has to be compatible with mdev's too and without
> hypercalls to create PASIDs that will be hard: mdev sharing a RID and
> slicing the physical PASIDs can't support a 'send all PASIDs to the
> guest' model, or even a 'the guest gets to pick the PASID' option.
>
Yes, with mdev the above is inevitable. I guess ARM may need an
extension in their SMMU similar to what VT-d provides for mdev
usage, e.g. at least not mandating that the PASID table live in GPA
space. But before that they may still expect a way to delegate the
whole per-RID PASID space to the guest.
Really lots of subtle differences to be generalized...
Thanks
Kevin
On Sun, Apr 25, 2021 at 09:24:46AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Friday, April 23, 2021 7:50 PM
> >
> > On Fri, Apr 23, 2021 at 09:06:44AM +0000, Tian, Kevin wrote:
> >
> > > Or could we still have just one /dev/ioasid but allow userspace to create
> > > multiple gpa_ioasid_id's each associated to a different iommu domain?
> > > Then the compatibility check will be done at ATTACH_IOASID instead of
> > > JOIN_IOASID_FD.
> >
> > To my mind what makes sense is that /dev/ioasid presents a single
> > IOMMU behavior that is basically the same. This may ultimately not be
> > what we call a domain today.
> >
> > We may end up with a middle object which is a group of domains that
> > all have the same capabilities, and we define capabilities in a way
> > that most platforms have a single group of domains.
> >
> > The key capability of a group of domains is they can all share the HW
> > page table representation, so if an IOASID instantiates a page table
> > it can be assigned to any device on any domain in the group of domains.
>
> Sorry that I didn't quite get it. If a group of domains can share the
> same page table then why not just attach all devices under those
> domains into a single domain?
Sure, if that works. But you shouldn't have things like IOMMU_CACHE
create different domains or trigger different /dev/ioasid's
> to describe the HW page table. Ideally a new iommu domain should
> be created only when it's impossible to share an existing page table.
> Otherwise you'll get bad iotlb efficiency because each domain has its
> unique domain id (tagged in iotlb) then duplicated iotlb entries may
> exist even when a single page table is shared by those domains.
Right, fewer is better
> Or, can you elaborate what is the targeted usage by having a group of
> domains which all share the same page table?
You just need to have a clear rule for what requires a new /dev/ioasid
FD - and if it maps to domains then great.
> Want to hear your opinion for one open here. There is no doubt that
> an ioasid represents a HW page table when the table is constructed by
> userspace and then linked to the IOMMU through the bind/unbind
> API. But I'm not very sure about whether an ioasid should represent
> the exact pgtable or the mapping metadata when the underlying
> pgtable is indirectly constructed through map/unmap API. VFIO does
> the latter way, which is why it allows multiple incompatible domains
> in a single container which all share the same mapping metadata.
I think VFIO's map/unmap is way too complex and we know it has bad
performance problems.
If /dev/ioasid is single HW page table only then I would focus on that
implementation and leave it to userspace to span different
/dev/ioasids if needed.
> OK, now I see where the disconnect comes from. In my context ioasid
> is the identifier that is actually used on the wire, but it seems you treat it as
> a sw-defined namespace purely for representing page tables. We should
> clear up this concept first before further discussing other details.
There is no general HW requirement that every IO page table be
referred to by the same PASID and this API would have to support
non-PASID IO page tables as well. So I'd keep the two things
separated in the uAPI - even though the kernel today has a global
PASID pool.
> Then following your proposal, does it mean that we need another
> interface for allocating PASID? and since ioasid means different
> thing in uAPI and in-kernel API, possibly a new name is required to
> avoid confusion?
I would suggest have two ways to control the PASID
1) Over /dev/ioasid allocate a PASID for an IOASID. All future PASID
based usages of the IOASID will use that global PASID
2) Over the device FD, when the IOASID is bound return the PASID that
was selected. If the IOASID does not have a global PASID then the
kernel is free to make something up. In this mode a single IOASID
can have multiple PASIDs.
Simple things like DPDK can use #2 and potentially have better PASID
limits. hypervisors will most likely have to use #1, but it depends on
how their vIOMMU interface works.
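In pseudo code the two flows might look roughly like this (the ioctl
names are placeholders for illustration only, not a proposed uAPI):

   // #1: a global PASID allocated up front over /dev/ioasid
   pasid = ioctl(ioasid_fd, IOASID_ALLOC_PASID, ioasid_id);
   ioctl(vfio_device_fd, ATTACH_IOASID_PASID, pasid, ioasid_id);

   // #2: no global PASID, the bind over the device FD returns whatever
   // PASID the kernel picked for this attachment
   pasid = ioctl(vfio_device_fd, ATTACH_IOASID, ioasid_id);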
I think the name IOASID is fine for the uAPI, the kernel version can
be called ioasid_id or something.
(also looking at ioasid.c, why do we need such a thin and odd wrapper
around xarray?)
Jason
On Fri, Apr 23, 2021 at 10:31:46AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Friday, April 23, 2021 7:40 AM
> >
> > On Thu, Apr 22, 2021 at 04:38:08PM -0600, Alex Williamson wrote:
> >
> > > Because it's fundamental to the isolation of the device? What you're
> > > proposing doesn't get around the group issue, it just makes it implicit
> > > rather than explicit in the uapi.
> >
> > I'm not even sure it makes it explicit or implicit, it just takes away
> > the FD.
> >
> > There are four group IOCTLs, I see them mapping to /dev/ioasid as follows:
> > VFIO_GROUP_GET_STATUS -
> > + VFIO_GROUP_FLAGS_CONTAINER_SET is fairly redundant
> > + VFIO_GROUP_FLAGS_VIABLE could be in a new sysfs under
> > kernel/iommu_groups, or could be an IOCTL on /dev/ioasid
> > IOASID_ALL_DEVICES_VIABLE
> >
> > VFIO_GROUP_SET_CONTAINER -
> > + This happens implicitly when the device joins the IOASID
> > so it gets moved to the vfio_device FD:
> > ioctl(vfio_device_fd, JOIN_IOASID_FD, ioasidfd)
> >
> > VFIO_GROUP_UNSET_CONTAINER -
> > + Also moved to the vfio_device FD, opposite of JOIN_IOASID_FD
> >
> > VFIO_GROUP_GET_DEVICE_FD -
> > + Replaced by opening /dev/vfio/deviceX
> > Learn the deviceX, which will be the cdev that sysfs shows as:
> > /sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/vfio/deviceX/dev
> > Open /dev/vfio/deviceX
> >
> > > > How do we model the VFIO group security concept to something like
> > > > VDPA?
> > >
> > > Is it really a "VFIO group security concept"? We're reflecting the
> > > reality of the hardware, not all devices are fully isolated.
> >
> > Well, exactly.
> >
> > /dev/ioasid should understand the group concept somehow, otherwise it
> > is incomplete and maybe even security broken.
> >
> > So, how do I add groups to, say, VDPA in a way that makes sense? The
> > only answer I come to is broadly what I outlined here - make
> > /dev/ioasid do all the group operations, and do them when we enjoin
> > the VDPA device to the ioasid.
> >
> > Once I have solved all the groups problems with the non-VFIO users,
> > then where does that leave VFIO? Why does VFIO need a group FD if
> > everyone else doesn't?
> >
> > > IOMMU group. This is the reality that any userspace driver needs to
> > > play in, it doesn't magically go away because we drop the group file
> > > descriptor.
> >
> > I'm not saying it does, I'm saying it makes the uAPI more regular and
> > easier to fit into /dev/ioasid without the group FD.
> >
> > > It only makes the uapi more difficult to use correctly because
> > > userspace drivers need to go outside of the uapi to have any idea
> > > that this restriction exists.
> >
> > I don't think it makes any substantive difference one way or the
> > other.
> >
> > With the group FD: the userspace has to read sysfs, find the list of
> > devices in the group, open the group fd, create device FDs for each
> > device using the name from sysfs.
> >
> > Starting from a BDF the general pseudo code is
> > group_path = readlink("/sys/bus/pci/devices/BDF/iommu_group")
> > group_name = basename(group_path)
> > group_fd = open("/dev/vfio/"+group_name)
> > device_fd = ioctl(VFIO_GROUP_GET_DEVICE_FD, BDF);
> >
> > Without the group FD: the userspace has to read sysfs, find the list
> > of devices in the group and then open the device-specific cdev (found
> > via sysfs) and link them to a /dev/ioasid FD.
> >
> > Starting from a BDF the general pseudo code is:
> > device_name = first_directory_of("/sys/bus/pci/devices/BDF/vfio/")
> > device_fd = open("/dev/vfio/"+device_name)
> > ioasidfd = open("/dev/ioasid")
> > ioctl(device_fd, JOIN_IOASID_FD, ioasidfd)
> >
> > These two routes can have identical outcomes and identical security
> > checks.
> >
> > In both cases if userspace wants a list of BDFs in the same group as
> > the BDF it is interested in:
> > readdir("/sys/bus/pci/devices/BDF/iommu_group/devices")
> >
> > It seems like a very small difference to me.
> >
> > I still don't see how the group restriction gets surfaced to the
> > application through the group FD. The applications I looked through
> > just treat the group FD as a step on their way to get the device_fd.
> >
>
> So your proposal sort of moves the entire container/group/domain
> management into /dev/ioasid and then leaves vfio to only provide device
> specific uAPI. An ioasid represents a page table (address space), thus
> is equivalent to the scope of VFIO container.
Right. I don't really know how /dev/ioasid is supposed to work, and
so far I don't see how it conceptually differs from a container. What
is it adding?
> Having the device join
> an ioasid is equivalent to attaching a device to a VFIO container, and
> here the group integrity must be enforced. Then /dev/ioasid anyway
> needs to manage group objects and their association with ioasid and
> underlying iommu domain, thus it's pointless to keep the same logic within
> VFIO. Is this understanding correct?
>
> btw one remaining open is whether you expect /dev/ioasid to be
> associated with a single iommu domain, or multiple. If only a single
> domain is allowed, the ioasid_fd is equivalent to the scope of VFIO
> container. It is supposed to have only one gpa_ioasid_id since one
> iommu domain can only have a single 2nd level pgtable. Then all other
> ioasids, once allocated, must be nested on this gpa_ioasid_id to fit
> in the same domain. if a legacy vIOMMU is exposed (which disallows
> nesting), the userspace has to open an ioasid_fd for every group.
> This is basically the VFIO way. On the other hand if multiple domains
> are allowed, there could be multiple ioasid_ids each holding a 2nd level
> pgtable and an iommu domain (or a list of pgtables and domains due to
> incompatibility issue as discussed in another thread), and can be
> nested by other ioasids respectively. The application only needs
> to open /dev/ioasid once regardless of whether vIOMMU allows
> nesting, and has a single interface for ioasid allocation. Which way
> do you prefer?
>
> Thanks
> Kevin
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Thu, Apr 22, 2021 at 08:39:50PM -0300, Jason Gunthorpe wrote:
> On Thu, Apr 22, 2021 at 04:38:08PM -0600, Alex Williamson wrote:
>
> > Because it's fundamental to the isolation of the device? What you're
> > proposing doesn't get around the group issue, it just makes it implicit
> > rather than explicit in the uapi.
>
> I'm not even sure it makes it explicit or implicit, it just takes away
> the FD.
>
> There are four group IOCTLs, I see them mapping to /dev/ioasid as follows:
> VFIO_GROUP_GET_STATUS -
> + VFIO_GROUP_FLAGS_CONTAINER_SET is fairly redundant
> + VFIO_GROUP_FLAGS_VIABLE could be in a new sysfs under
> kernel/iommu_groups, or could be an IOCTL on /dev/ioasid
> IOASID_ALL_DEVICES_VIABLE
>
> VFIO_GROUP_SET_CONTAINER -
> + This happens implicitly when the device joins the IOASID
> so it gets moved to the vfio_device FD:
> ioctl(vfio_device_fd, JOIN_IOASID_FD, ioasidfd)
>
> VFIO_GROUP_UNSET_CONTAINER -
> + Also moved to the vfio_device FD, opposite of JOIN_IOASID_FD
>
> VFIO_GROUP_GET_DEVICE_FD -
> + Replaced by opening /dev/vfio/deviceX
> Learn the deviceX, which will be the cdev that sysfs shows as:
> /sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/vfio/deviceX/dev
> Open /dev/vfio/deviceX
>
> > > How do we model the VFIO group security concept to something like
> > > VDPA?
> >
> > Is it really a "VFIO group security concept"? We're reflecting the
> > reality of the hardware, not all devices are fully isolated.
>
> Well, exactly.
>
> /dev/ioasid should understand the group concept somehow, otherwise it
> is incomplete and maybe even security broken.
>
> So, how do I add groups to, say, VDPA in a way that makes sense? The
> only answer I come to is broadly what I outlined here - make
> /dev/ioasid do all the group operations, and do them when we enjoin
> the VDPA device to the ioasid.
>
> Once I have solved all the groups problems with the non-VFIO users,
> then where does that leave VFIO? Why does VFIO need a group FD if
> everyone else doesn't?
>
> > IOMMU group. This is the reality that any userspace driver needs to
> > play in, it doesn't magically go away because we drop the group file
> > descriptor.
>
> I'm not saying it does, I'm saying it makes the uAPI more regular and
> easier to fit into /dev/ioasid without the group FD.
>
> > It only makes the uapi more difficult to use correctly because
> > userspace drivers need to go outside of the uapi to have any idea
> > that this restriction exists.
>
> I don't think it makes any substantive difference one way or the
> other.
>
> With the group FD: the userspace has to read sysfs, find the list of
> devices in the group, open the group fd, create device FDs for each
> device using the name from sysfs.
>
> Starting from a BDF the general pseudo code is
> group_path = readlink("/sys/bus/pci/devices/BDF/iommu_group")
> group_name = basename(group_path)
> group_fd = open("/dev/vfio/"+group_name)
> device_fd = ioctl(VFIO_GROUP_GET_DEVICE_FD, BDF);
>
> Without the group FD: the userspace has to read sysfs, find the list
> of devices in the group and then open the device-specific cdev (found
> via sysfs) and link them to a /dev/ioasid FD.
>
> Starting from a BDF the general pseudo code is:
> device_name = first_directory_of("/sys/bus/pci/devices/BDF/vfio/")
> device_fd = open("/dev/vfio/"+device_name)
> ioasidfd = open("/dev/ioasid")
> ioctl(device_fd, JOIN_IOASID_FD, ioasidfd)
This line is the problem.
[Historical aside: Alex's early drafts for the VFIO interface looked
quite similar to this. Ben Herrenschmidt and myself persuaded him it
was a bad idea, and groups were developed instead. I still think it's
a bad idea, and not just for POWER]
As Alex says, if this line fails because of the group restrictions,
that's not great because it's not very obvious what's gone wrong. But
IMO, the success path on a multi-device group is kind of worse:
you've now made a meaningful and visible change to the setup of
devices which are not mentioned in this line *at all*. If you've
changed the DMA address space of this device you've also changed it
for everything else in the group - there's no getting around that.
For both those reasons, I absolutely agree with Alex that retaining
the explicit group model is valuable.
Yes, it makes set up more of a pain, but it's necessary complexity to
actually understand what's going on here.
> These two routes can have identical outcomes and identical security
> checks.
>
> In both cases if userspace wants a list of BDFs in the same group as
> the BDF it is interested in:
> readdir("/sys/bus/pci/devices/BDF/iommu_group/devices")
>
> It seems like a very small difference to me.
>
> I still don't see how the group restriction gets surfaced to the
> application through the group FD. The applications I looked through
> just treat the group FD as a step on their way to get the device_fd.
>
> Jason
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Thu, Apr 22, 2021 at 11:13:37AM -0600, Alex Williamson wrote:
> On Wed, 21 Apr 2021 20:03:01 -0300
> Jason Gunthorpe <[email protected]> wrote:
>
> > On Wed, Apr 21, 2021 at 01:33:12PM -0600, Alex Williamson wrote:
> >
> > > > I still expect that VFIO_GROUP_SET_CONTAINER will be used to connect
> > > > /dev/{ioasid,vfio} to the VFIO group and all the group and device
> > > > logic stays inside VFIO.
> > >
> > > But that group and device logic is also tied to the container, where
> > > the IOMMU backend is the interchangeable thing that provides the IOMMU
> > > manipulation for that container.
> >
> > I think that is an area where the discussion would need to be focused.
> >
> > I don't feel very prepared to have it in details, as I haven't dug
> > into all the group and iommu micro-operation very much.
> >
> > But, it does seem like the security concept that VFIO is creating with
> > the group also has to be present in the lower iommu layer too.
> >
> > With different subsystems joining devices to the same ioasid's we
> > still have to enforce the security property the vfio group is creating.
> >
> > > If you're using VFIO_GROUP_SET_CONTAINER to associate a group to a
> > > /dev/ioasid, then you're really either taking that group outside of
> > > vfio or you're re-implementing group management in /dev/ioasid.
> >
> > This sounds right.
> >
> > > > Everything can be switched to ioasid_container all down the line. If
> > > > it wasn't for PPC this looks fairly simple.
> > >
> > > At what point is it no longer vfio? I'd venture to say that replacing
> > > the container rather than invoking a different IOMMU backend is that
> > > point.
> >
> > sorry, which is no longer vfio?
>
> I'm suggesting that if we're replacing the container/group model with
> an ioasid then we're effectively creating a new thing that really only
> retains the vfio device uapi.
>
> > > > Since getting rid of PPC looks a bit hard, we'd be stuck with
> > > > accepting a /dev/ioasid and then immediately wrappering it in a
> > > > vfio_container and shimming it through a vfio_iommu_ops. It is not
> > > > ideal at all, but in my look around I don't see a major problem if
> > > > type1 implementation is moved to live under /dev/ioasid.
> > >
> > > But type1 is \just\ an IOMMU backend, not "/dev/vfio". Given that
> > > nobody flinched at removing NVLink support, maybe just deprecate SPAPR
> > > now and see if anyone objects ;)
> >
> > Would simplify this project, but I wonder :)
> >
> > In any event, it does look like today we'd expect the SPAPR stuff
> > would be done through the normal iommu APIs, perhaps enhanced a bit,
> > which makes me suspect an enhanced type1 can implement SPAPR.
>
> David Gibson has argued for some time that SPAPR could be handled via a
> converged type1 model. We had mapped that out at one point,
> essentially a "type2", but neither of us had any bandwidth to pursue it.
Right. The sPAPR TCE backend is kind of an unfortunate accident of
history. We absolutely could do a common interface, but no-one's had
time to work on it.
> > I say this because the SPAPR looks quite a lot like PASID when it has
> > APIs for allocating multiple tables and other things. I would be
> > interested to hear someone from IBM talk about what it is doing and
> > how it doesn't fit into today's IOMMU API.
Hm. I don't think it's really like PASID. Just like Type1, the TCE
backend represents a single DMA address space which all devices in the
container will see at all times. The difference is that there can be
multiple (well, 2) "windows" of valid IOVAs within that address space.
Each window can have a different TCE (page table) layout. For kernel
drivers, a smallish translated window at IOVA 0 is used for 32-bit
devices, and a large direct mapped (no page table) window is created
at a high IOVA for better performance with 64-bit DMA capable devices.
With the VFIO backend we create (but don't populate) a similar
smallish 32-bit window, userspace can create its own secondary window
if it likes, though obviously for userspace use there will always be a
page table. Userspace can choose the total size (but not address),
page size and to an extent the page table format of the created
window. Note that the TCE page table format is *not* the same as the
POWER CPU core's page table format. Userspace can also remove the
default small window and create its own.
The second wrinkle is pre-registration. That lets userspace register
certain userspace VA ranges (*not* IOVA ranges) as being the only ones
allowed to be mapped into the IOMMU. This is a performance
optimization, because on pre-registration we also pre-account memory
that will be effectively locked by DMA mappings, rather than doing it
at DMA map and unmap time.
This came about because POWER guests always contain a vIOMMU. That
(combined with the smallish default IOVA window) means that DMA maps
and unmaps can become an important bottleneck, rather than being
basically a small once-off cost when qemu maps all of guest memory
into the IOMMU. That's optimized with a special interlink between
KVM and VFIO that accelerates the guest-initiated maps/unmap
operations. However, it's not feasible to do the accounting in that
fast path, hence the need for the pre-registration.
>
> [Cc David, Alexey]
>
> > It is very old and the iommu world has advanced tremendously lately,
> > maybe I'm too optimistic?
> >
> > > > We end up with a ioasid.h that basically has the vfio_iommu_type1 code
> > > > lightly recast into some 'struct iommu_container' and a set of
> > > > ioasid_* function entry points that follow vfio_iommu_driver_ops_type1:
> > > > ioasid_attach_group
> > > > ioasid_detach_group
> > > > ioasid_<something about user pages>
> > > > ioasid_read/ioasid_write
> > >
> > > Again, this looks like a vfio IOMMU backend. What are we accomplishing
> > > by replacing /dev/vfio with /dev/ioasid versus some manipulation of
> > > VFIO_SET_IOMMU accepting a /dev/ioasid fd?
> >
> > The point of all of this is to make the user api for the IOMMU
> > cross-subsystem. It is not a vfio IOMMU backend, it is moving the
> > IOMMU abstraction from VFIO into the iommu framework and giving the
> > iommu framework a re-usable user API.
I like the idea of a common DMA/IOMMU handling system across
platforms. However in order to be efficiently usable for POWER it
will need to include multiple windows, allowing the user to change
those windows and something like pre-registration to amortize
accounting costs for heavy vIOMMU load.
Well... possibly we can do without the pre-reg now that 32-bit DMA
limited devices are less common, as are POWER8 systems. With modern
devices and modern kernels a guest is likely to use a single large
64-bit secondary window mapping all guest RAM, so the vIOMMU
bottleneck shouldn't be such an issue.
> Right, but I don't see that implies it cannot work within the vfio
> IOMMU model. Currently when an IOMMU is set, the /dev/vfio/vfio
> container becomes a conduit for file ops from the container to be
> forwarded to the IOMMU. But that's in part because the user doesn't
> have another object to interact with the IOMMU. It's entirely possible
> that with an ioasid shim, the user would continue to interact directly
> with the /dev/ioasid fd for IOMMU manipulation and only use
> VFIO_SET_IOMMU to associate a vfio container to that ioasid.
>
> > My ideal outcome would be for VFIO to use only the new iommu/ioasid
> > API and have no iommu pluggability at all. The iommu subsystem
> > provides everything needed to VFIO, and provides it equally to VDPA
> > and everything else.
>
> As above, we don't necessarily need to have the vfio container be the
> access mechanism for the IOMMU, it can become just a means to
> associate the container with an IOMMU. This has quite a few
> transitional benefits.
>
> > drivers/vfio/ becomes primarily about 'struct vfio_device' and
> > everything related to its IOCTL interface.
> >
> > drivers/iommu and ioasid.c become all about a pluggable IOMMU
> > interface, including a uAPI for it.
> >
> > IMHO it makes a high level sense, though it may be a pipe dream.
>
> This is where we've dissolved all but the vfio device uapi, which
> suggests the group and container model were never necessary and I'm not
> sure exactly what that uapi looks like. We currently make use of an
> IOMMU api that is group aware, but that awareness extends out to the
> vfio uapi.
>
> > > > If we have this, and /dev/ioasid implements the legacy IOCTLs, then
> > > > /dev/vfio == /dev/ioasid and we can compile out vfio_fops and related
> > > > from vfio.c and tell ioasid.c to create /dev/vfio instead using the
> > > > ops it owns.
> > >
> > > Why would we want /dev/ioasid to implement legacy ioctls instead of
> > > simply implementing an interface to allow /dev/ioasid to be used as a
> > > vfio IOMMU backend?
> >
> > Only to make our own migration easier. I'd imagine everyone would want
> > to sit down and design this new clear ioasid API that can co-exist on
> > /dev/ioasid with the legacy one.
>
> vfio really just wants to be able to attach groups to an address space
> to consider them isolated, everything else about the IOMMU API could
> happen via a new ioasid file descriptor representing that context, ie.
> vfio handles the group ownership and device access, ioasid handles the
> actual mappings.
>
> > > The pseudo code above really suggests you do want to remove
> > > /dev/vfio/vfio, but this is only one of the IOMMU backends for vfio, so
> > > I can't quite figure out if we're talking past each other.
> >
> > I'm not quite sure what you mean by "one of the IOMMU backends?" You
> > mean type1, right?
> >
> > > As I expressed in another thread, type1 has a lot of shortcomings. The
> > > mapping interface leaves userspace trying desperately to use statically
> > > mapped buffers because the map/unmap latency is too high. We have
> > > horrible issues with duplicate locked page accounting across
> > > containers. It suffers pretty hard from feature creep in various
> > > areas. A new IOMMU backend is an opportunity to redesign some of these
> > > things.
> >
> > Sure, but also those kinds of transformational things go alot better
> > if you can smoothly go from the old to the new and have technical
> > co-existence inside the kernel. Having a shim that maps the old APIs
> > to new APIs internally to Linux helps keep the implementation from
> > becoming too bogged down with compatibility.
>
> I'm afraid /dev/ioasid providing type1 compatibility would be just that.
>
> > > The IOMMU group also abstracts isolation and visibility relative to
> > > DMA. For example, in a PCIe topology a multi-function device may not
> > > have isolation between functions, but each requester ID is visible to
> > > the IOMMU.
> >
> > Okay, I'm glad I have this all right in my head, as I was pretty sure
> > this was what the group was about.
> >
> > My next question is why do we have three things as a FD: group, device
> > and container (aka IOMMU interface)?
> >
> > Do we have container because the /dev/vfio/vfio can hold only a single
> > page table so we need to swap containers sometimes?
>
> The container represents an IOMMU address space, which can be shared by
> multiple groups, where each group may contain one or more devices.
> Swapping a container would require releasing all the devices (the user
> cannot have access to a non-isolated device), then a group could be
> moved from one container to another.
>
> > If we start from a clean sheet and make a sketch..
> >
> > /dev/ioasid is the IOMMU control interface. It can create multiple
> > IOASIDs that have page tables and it can manipulate those page tables.
> > Each IOASID is identified by some number.
> >
> > struct vfio_device/vdpa_device/etc are consumers of /dev/ioasid
> >
> > When a device attaches to an ioasid userspace gives VFIO/VDPA the
> > ioasid FD and the ioasid # in the FD.
> >
> > The security rule for isolation is that once a device is attached to a
> > /dev/ioasid fd then all other devices in that security group must be
> > attached to the same ioasid FD or left unused.
>
> Sounds like a group... Note also that if those other devices are not
> isolated from the user's device, the user could manipulate "unused"
> devices via DMA. So even unused devices should be within the same
> IOMMU context... thus attaching groups to IOMMU domains.
>
> > Thus /dev/ioasid also becomes the unit of security and the IOMMU
> > subsystem level becomes aware of and enforces the group security
> > rules. Userspace does not need to "see" the group
>
> What tools does userspace have to understand isolation of individual
> devices without groups?
>
> > In sketch it would be like
> > ioasid_fd = open("/dev/ioasid");
> > vfio_device_fd = open("/dev/vfio/device0")
> > vdpa_device_fd = open("/dev/vdpa/device0")
> > ioctl(vfio_device_fd, JOIN_IOASID_FD, ioasidfd)
> > ioctl(vdpa_device_fd, JOIN_IOASID_FD, ioasifd)
> >
> > gpa_ioasid_id = ioctl(ioasid_fd, CREATE_IOASID, ..)
> > ioctl(ioasid_fd, SET_IOASID_PAGE_TABLES, ..)
> >
> > ioctl(vfio_device, ATTACH_IOASID, gpa_ioasid_id)
> > ioctl(vpda_device, ATTACH_IOASID, gpa_ioasid_id)
> >
> > .. both VDPA and VFIO see the guest physical map and the kernel has
> > enough info that both could use the same IOMMU page table
> > structure ..
> >
> > // Guest viommu turns off bypass mode for the vfio device
> > ioctl(vfio_device, DETACH_IOASID)
> >
> > // Guest viommu creates a new page table
> > rid_ioasid_id = ioctl(ioasid_fd, CREATE_IOASID, ..)
> > ioctl(ioasid_fd, SET_IOASID_PAGE_TABLES, ..)
> >
> > // Guest viommu links the new page table to the RID
> > ioctl(vfio_device, ATTACH_IOASID, rid_ioasid_id)
> >
> > The group security concept becomes implicit and hidden from the
> > uAPI. JOIN_IOASID_FD implicitly finds the device's group inside the
> > kernel and requires that all members of the group be joined only to
> > this ioasid_fd.
> >
> > Essentially we discover the group from the device instead of the
> > device from the group.
> >
> > Where does it fall down compared to the three FD version we have
> > today?
>
> The group concept is explicit today because how does userspace learn
> about implicit dependencies between devices? For example, if the user
> has a conventional PCI bus with a couple devices on it, how do they
> understand that those devices cannot be assigned to separate userspace
> drivers? The group fd fills that gap. Thanks,
>
> Alex
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Fri, Apr 23, 2021 at 07:28:03PM -0300, Jason Gunthorpe wrote:
> On Fri, Apr 23, 2021 at 10:38:51AM -0600, Alex Williamson wrote:
> > On Thu, 22 Apr 2021 20:39:50 -0300
>
> > > /dev/ioasid should understand the group concept somehow, otherwise it
> > > is incomplete and maybe even security broken.
> > >
> > > So, how do I add groups to, say, VDPA in a way that makes sense? The
> > > only answer I come to is broadly what I outlined here - make
> > > /dev/ioasid do all the group operations, and do them when we enjoin
> > > the VDPA device to the ioasid.
> > >
> > > Once I have solved all the groups problems with the non-VFIO users,
> > > then where does that leave VFIO? Why does VFIO need a group FD if
> > > everyone else doesn't?
> >
> > This assumes there's a solution for vDPA that doesn't just ignore the
> > problem and hope for the best. I can't speak to a vDPA solution.
>
> I don't think we can just ignore the question and succeed with
> /dev/ioasid.
>
> Guess it should get answered as best it can for ioasid "in general"
> then we can decide if it makes sense for VFIO to use the group FD or
> not when working in ioasid mode.
>
> Maybe a better idea will come up
>
> > an implicit restriction. You've listed a step in the description about
> > a "list of devices in the group", but nothing in the pseudo code
> > reflects that step.
>
> I gave it below with the readdir() - it isn't in the pseudo code
> because the applications I looked through didn't use it, and wouldn't
> benefit from it. I tried to show what things were doing today.
And chances are they will break cryptically if you give them a device
in a multi-device group. That's not something we want to encourage.
>
> > I expect it would be a subtly missed by any userspace driver
> > developer unless they happen to work on a system where the grouping
> > is not ideal.
>
> I'm still unclear - what would be the consequences if the application
> designer misses the group detail?
>
> Jason
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Tue, Apr 27, 2021 at 03:11:25PM +1000, David Gibson wrote:
> > So your proposal sort of moves the entire container/group/domain
> > management into /dev/ioasid and then leaves vfio to only provide device
> > specific uAPI. An ioasid represents a page table (address space), thus
> > is equivalent to the scope of VFIO container.
>
> Right. I don't really know how /dev/ioasid is supposed to work, and
> so far I don't see how it conceptually differs from a container. What
> is it adding?
There are three motivating topics:
1) /dev/vfio/vfio is only usable by VFIO and we have many interesting
use cases now where we need the same thing usable outside VFIO
2) /dev/vfio/vfio does not support modern stuff like PASID and
updating to support that is going to be a big change, like adding
multiple IOASIDs so they can be modeled as a tree inside a
single FD
3) I understand there is some desire to revise the uAPI here a bit,
ie Alex mentioned the poor mapping performance.
I would say it is not conceptually different from what VFIO calls a
container, it is just a different uAPI with the goal to be cross
subsystem.
Jason
On Tue, Apr 27, 2021 at 03:08:46PM +1000, David Gibson wrote:
> > Starting from a BDF the general pseudo code is:
> > device_name = first_directory_of("/sys/bus/pci/devices/BDF/vfio/")
> > device_fd = open("/dev/vfio/"+device_name)
> > ioasidfd = open("/dev/ioasid")
> > ioctl(device_fd, JOIN_IOASID_FD, ioasidfd)
>
> This line is the problem.
>
> [Historical aside: Alex's early drafts for the VFIO interface looked
> quite similar to this. Ben Herrenschmidt and myself persuaded him it
> was a bad idea, and groups were developed instead. I still think it's
> a bad idea, and not just for POWER]
Spawning the VFIO device FD from the group FD is incredibly gross from
a kernel design perspective. Since that was done the struct
vfio_device missed out on a sysfs presence and doesn't have the
typical 'struct device' member or dedicated char device you'd expect a
FD based subsystem to have.
This basically traded normal usage of the driver core for something
that doesn't serve a technical usage. Given we are now nearly 10 years
later and see that real widely deployed applications are not doing
anything special with the group FD it makes me question the wisdom of
this choice.
> As Alex says, if this line fails because of the group restrictions,
> that's not great because it's not very obvious what's gone wrong.
Okay, that is fair, but let's solve that problem directly. For
instance netlink has been going in the direction of adding a "extack"
from the kernel which is a descriptive error string. If the failing
ioctl returned the string:
"cannot join this device to the IOASID because device XXX in the
same group #10 is in use"
Would you agree it is now obvious what has gone wrong? In fact would
you agree this is a lot better user experience than what applications
do today even though they have the group FD?
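Purely as a sketch of one possible shape for that (nothing like this
exists for ioctls today), the join request could carry a user-supplied
buffer the kernel fills with the reason on failure:

   struct ioasid_join {
           __u32 argsz;
           __u32 flags;
           __s32 ioasid_fd;
           __u32 err_msg_size;   /* size of the buffer at err_msg_ptr */
           __u64 err_msg_ptr;    /* kernel writes a NUL-terminated reason here on error */
   };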
> But IMO, the success path on a multi-device group is kind of worse:
> you've now made made a meaningful and visible change to the setup of
> devices which are not mentioned in this line *at all*.
I don't think spawning a single device_fd from the group clearly says
there are repercussions outside that immediate, single, device.
That comes from understanding what the ioctls are doing, and reading
the documentation. The same applies to a non-group FD world.
> Yes, it makes set up more of a pain, but it's necessary complexity to
> actually understand what's going on here.
There is a real technical problem here - the VFIO group is the thing
that spawns the device_fd and that is incompatible with the idea to
centralize the group security logic in drivers/iommu/ and share it
with multiple subsystems.
We also don't have an obvious clean way to incorporate a group FD into
other subsystems (nor would I want to).
One option is VFIO can keep its group FD but nothing else will have
anything like it. However I don't much like the idea that VFIO will
have a special and unique programming model to do the same things
other subsystems will do. That will make it harder for userspace to
implement.
But again, let's see what the draft ioasid proposal looks like and
maybe someone will see a different solution.
Jason
On Tue, Apr 27, 2021 at 02:50:45PM +1000, David Gibson wrote:
> > > I say this because the SPAPR looks quite a lot like PASID when it has
> > > APIs for allocating multiple tables and other things. I would be
> > > interested to hear someone from IBM talk about what it is doing and
> > > how it doesn't fit into today's IOMMU API.
>
> Hm. I don't think it's really like PASID. Just like Type1, the TCE
> backend represents a single DMA address space which all devices in the
> container will see at all times. The difference is that there can be
> multiple (well, 2) "windows" of valid IOVAs within that address space.
> Each window can have a different TCE (page table) layout. For kernel
> drivers, a smallish translated window at IOVA 0 is used for 32-bit
> devices, and a large direct mapped (no page table) window is created
> at a high IOVA for better performance with 64-bit DMA capable devices.
>
> With the VFIO backend we create (but don't populate) a similar
> smallish 32-bit window, userspace can create its own secondary window
> if it likes, though obvious for userspace use there will always be a
> page table. Userspace can choose the total size (but not address),
> page size and to an extent the page table format of the created
> window. Note that the TCE page table format is *not* the same as the
> POWER CPU core's page table format. Userspace can also remove the
> default small window and create its own.
So what do you need from the generic API? I'd suggest if userspace
passes in the required IOVA range it would benefit all the IOMMU
drivers to setup properly sized page tables and PPC could use that to
drive a single window. I notice this is all DPDK did to support TCE.
> The second wrinkle is pre-registration. That lets userspace register
> certain userspace VA ranges (*not* IOVA ranges) as being the only ones
> allowed to be mapped into the IOMMU. This is a performance
> optimization, because on pre-registration we also pre-account memory
> that will be effectively locked by DMA mappings, rather than doing it
> at DMA map and unmap time.
This feels like nesting IOASIDs to me, much like a vPASID.
The pre-registered VA range would be the root of the tree and the
vIOMMU created ones would be children of the tree. This could allow
the map operations of the child to refer to already prepped physical
memory held in the root IOASID avoiding the GUP/etc cost.
Seems fairly genericish, though I'm not sure about the kvm linkage..
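In pseudo code the nesting could look something like this (all the
ioctl names below are illustrative only):

   // root IOASID holds the pre-registered, pre-accounted user VA
   root_ioasid = ioctl(ioasid_fd, CREATE_IOASID, ..);
   ioctl(ioasid_fd, IOASID_REGISTER_USER_VA, root_ioasid, va, length);

   // vIOMMU-created children map IOVAs onto memory already prepped
   // in the root, avoiding GUP/accounting in the map path
   child_ioasid = ioctl(ioasid_fd, CREATE_NESTED_IOASID, root_ioasid);
   ioctl(ioasid_fd, IOASID_MAP_DMA, child_ioasid, iova, va, length);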
> I like the idea of a common DMA/IOMMU handling system across
> platforms. However in order to be efficiently usable for POWER it
> will need to include multiple windows, allowing the user to change
> those windows and something like pre-registration to amortize
> accounting costs for heavy vIOMMU load.
I have a feeling /dev/ioasid is going to end up with some HW specific
escape hatch to create some HW specific IOASID types and operate on
them in a HW specific way.
However, what I would like to see is that something simple like DPDK
can have a single implementation - POWER should implement the standard
operations and map them to something that will work for it.
As an ideal, only things like the HW specific qemu vIOMMU driver
should be reaching for all the special stuff.
In this way the kernel IOMMU driver and the qemu user vIOMMU driver
would form something of a classical split user/kernel driver pattern.
Jason
On Tue, Apr 27, 2021 at 01:39:54PM -0300, Jason Gunthorpe wrote:
> On Tue, Apr 27, 2021 at 03:11:25PM +1000, David Gibson wrote:
>
> > > So your proposal sort of moves the entire container/group/domain
> > > management into /dev/ioasid and then leaves vfio to only provide device
> > > specific uAPI. An ioasid represents a page table (address space), thus
> > > is equivalent to the scope of VFIO container.
> >
> > Right. I don't really know how /dev/ioasid is supposed to work, and
> > so far I don't see how it conceptually differs from a container. What
> > is it adding?
>
> There are three motivating topics:
> 1) /dev/vfio/vfio is only usable by VFIO and we have many interesting
> use cases now where we need the same thing usable outside VFIO
> 2) /dev/vfio/vfio does not support modern stuff like PASID and
> updating to support that is going to be a big change, like adding
> multiple IOASIDs so they can be modeled as a tree inside a
> single FD
> 3) I understand there is some desire to revise the uAPI here a bit,
> ie Alex mentioned the poor mapping performance.
>
> I would say it is not conceptually different from what VFIO calls a
> container, it is just a different uAPI with the goal to be cross
> subsystem.
Ok, that makes sense.
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Tue, Apr 27, 2021 at 02:12:12PM -0300, Jason Gunthorpe wrote:
> On Tue, Apr 27, 2021 at 03:08:46PM +1000, David Gibson wrote:
> > > Starting from a BDF the general pseudo code is:
> > > device_name = first_directory_of("/sys/bus/pci/devices/BDF/vfio/")
> > > device_fd = open("/dev/vfio/"+device_name)
> > > ioasidfd = open("/dev/ioasid")
> > > ioctl(device_fd, JOIN_IOASID_FD, ioasidfd)
> >
> > This line is the problem.
> >
> > [Historical aside: Alex's early drafts for the VFIO interface looked
> > quite similar to this. Ben Herrenschmidt and myself persuaded him it
> > was a bad idea, and groups were developed instead. I still think it's
> > a bad idea, and not just for POWER]
>
> Spawning the VFIO device FD from the group FD is incredibly gross from
> a kernel design perspective. Since that was done the struct
> vfio_device missed out on a sysfs presence and doesn't have the
> typical 'struct device' member or dedicated char device you'd expect a
> FD based subsystem to have.
>
> This basically traded normal usage of the driver core for something
> that doesn't serve a technical usage. Given we are now nearly 10 years
> later and see that real widely deployed applications are not doing
> anything special with the group FD it makes me question the wisdom of
> this choice.
I'm really not sure what "anything special" would constitute here.
> > As Alex says, if this line fails because of the group restrictions,
> > that's not great because it's not very obvious what's gone wrong.
>
> Okay, that is fair, but let's solve that problem directly. For
> instance netlink has been going in the direction of adding a "extack"
> from the kernel which is a descriptive error string. If the failing
> ioctl returned the string:
>
> "cannot join this device to the IOASID because device XXX in the
> same group #10 is in use"
Um.. is there a sane way to return strings from an ioctl()?
> Would you agree it is now obvious what has gone wrong? In fact would
> you agree this is a lot better user experience than what applications
> do today even though they have the group FD?
>
> > But IMO, the success path on a multi-device group is kind of worse:
> > you've now made made a meaningful and visible change to the setup of
> > devices which are not mentioned in this line *at all*.
>
> I don't think spawning a single device_fd from the group clearly says
> there are repercussions outside that immediate, single, device.
It's not the fact that the device fds are spawned from the group fd.
It's the fact that the "attach" operation - binding the group to the
container now, binding the whatever to the ioasid in future -
explicitly takes a group. That's an operation that affects a group,
so the interface should reflect that.
Getting the device fds from the group fd kind of follows, because it's
unsafe to do basically anything on the device unless you already
control the group (which in this case means attaching it to a
container/ioasid). I'm entirely open to ways of doing that that are
less inelegant from a sysfs integration point of view, but the point
is you must manage the group before you can do anything at all with
individual devices.
> That comes from understanding what the ioctls are doing, and reading
> the documentation. The same applies to some non-group FD world.
>
> > Yes, it makes set up more of a pain, but it's necessary complexity to
> > actually understand what's going on here.
>
> There is a real technical problem here - the VFIO group is the thing
> that spawns the device_fd and that is incompatible with the idea to
> centralize the group security logic in drivers/iommu/ and share it
> with multiple subsystems.
I don't see why. I mean, sure, you don't want explicitly the *vfio*
group as such. But IOMMU group is already a cross-subsystem concept
and you can explicitly expose that in a different way.
> We also don't have an obvious clean way to incorporate a group FD into
> other subsystems (nor would I want to).
If you don't have a group concept in other subsystems, there's a fair
chance they are broken. There are a bunch of operations that are
inherently per-group. Well.. per container/IOASID, but the
granularity of membership for that is the group.
> One option is VFIO can keep its group FD but nothing else will have
> anything like it. However I don't much like the idea that VFIO will
> have a special and unique programming model to do the same things
> other subsystems will do. That will make it harder for userspace to
> implement.
Again, I really think this is necessary complexity. You're right that
far too little of the userspace properly understands group
restrictions.. but these come from real hardware limitations, and I
don't feel like making it *less* obvious in the interface is going to
help that.
> But again, let's see what the draft ioasid proposal looks like and
> maybe someone will see a different solution.
>
> Jason
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Tue, Apr 27, 2021 at 02:24:32PM -0300, Jason Gunthorpe wrote:
> On Tue, Apr 27, 2021 at 02:50:45PM +1000, David Gibson wrote:
>
> > > > I say this because the SPAPR looks quite a lot like PASID when it has
> > > > APIs for allocating multiple tables and other things. I would be
> > > > interested to hear someone from IBM talk about what it is doing and
> > > > how it doesn't fit into today's IOMMU API.
> >
> > Hm. I don't think it's really like PASID. Just like Type1, the TCE
> > backend represents a single DMA address space which all devices in the
> > container will see at all times. The difference is that there can be
> > multiple (well, 2) "windows" of valid IOVAs within that address space.
> > Each window can have a different TCE (page table) layout. For kernel
> > drivers, a smallish translated window at IOVA 0 is used for 32-bit
> > devices, and a large direct mapped (no page table) window is created
> > at a high IOVA for better performance with 64-bit DMA capable devices.
> >
> > With the VFIO backend we create (but don't populate) a similar
> > smallish 32-bit window, userspace can create its own secondary window
> > if it likes, though obviously for userspace use there will always be a
> > page table. Userspace can choose the total size (but not address),
> > page size and to an extent the page table format of the created
> > window. Note that the TCE page table format is *not* the same as the
> > POWER CPU core's page table format. Userspace can also remove the
> > default small window and create its own.
>
> So what do you need from the generic API? I'd suggest if userspace
> passes in the required IOVA range it would benefit all the IOMMU
> drivers to setup properly sized page tables and PPC could use that to
> drive a single window. I notice this is all DPDK did to support TCE.
Yes. My proposed model for a unified interface would be that when you
create a new container/IOASID, *no* IOVAs are valid. Before you can
map anything you would have to create a window with specified base,
size, pagesize (probably some flags for extension, too). That could
fail if the backend IOMMU can't handle that IOVA range, it could be a
backend no-op if the requested window lies within a fixed IOVA range
the backend supports, or it could actually reprogram the back end for
the new window (such as for POWER TCEs). Regardless of the hardware,
attempts to map outside the created window(s) would be rejected by
software.
I expect we'd need some kind of query operation to expose limitations
on the number of windows, addresses for them, available pagesizes etc.
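Very roughly, with entirely made-up ioctl and field names (this is just a
sketch of the shape I have in mind, not a proposal for the actual uAPI):

struct ioasid_iova_window {
        __u64 base;             /* IOVA where the window starts */
        __u64 size;             /* window length in bytes */
        __u64 pagesize;         /* IO page size used inside the window */
        __u32 flags;            /* room for future extensions */
};

struct ioasid_iova_caps {
        __u32 max_windows;      /* how many windows the backend allows */
        __u64 pagesize_bitmap;  /* IO page sizes the backend can do */
        __u64 min_iova, max_iova;
};

ioctl(ioasid_fd, IOASID_QUERY_IOVA_CAPS, &caps);   /* discover the limits */
ioctl(ioasid_fd, IOASID_CREATE_IOVA_WINDOW, &win); /* nothing mappable before this */
/* any map outside the created window(s) is rejected in software */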
> > The second wrinkle is pre-registration. That lets userspace register
> > certain userspace VA ranges (*not* IOVA ranges) as being the only ones
> > allowed to be mapped into the IOMMU. This is a performance
> > optimization, because on pre-registration we also pre-account memory
> > that will be effectively locked by DMA mappings, rather than doing it
> > at DMA map and unmap time.
>
> This feels like nesting IOASIDs to me, much like a vPASID.
>
> The pre-registered VA range would be the root of the tree and the
> vIOMMU created ones would be children of the tree. This could allow
> the map operations of the child to refer to already prepped physical
> memory held in the root IOASID avoiding the GUP/etc cost.
Huh... I never thought of it that way, but yeah, that sounds like it
could work. More elegantly than the current system in fact.
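Just to check I'm picturing it right, something like this (the names are
invented here, purely a sketch):

/* root IOASID holds the pre-registered (pinned and accounted) VA range */
root = ioctl(ioasid_fd, CREATE_IOASID, ...);
ioctl(ioasid_fd, IOASID_REGISTER_MEMORY, root, uva_base, uva_len);

/* vIOMMU-created child refers back to memory already prepped in the root */
child = ioctl(ioasid_fd, CREATE_NESTED_IOASID, root);
/* no GUP or accounting here, the pages come from the root */
ioctl(ioasid_fd, IOASID_MAP_DMA, child, iova, uva, len);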
> Seems fairly genericish, though I'm not sure about the kvm linkage..
I think it should be doable. We'd basically need to give KVM a handle
on the parent AS, and the child AS, and the guest side handle (what
PAPR calls a "Logical IO Bus Number" - liobn). KVM would then
translate H_PUT_TCE etc. hypercalls on that liobn into calls into the
IOMMU subsystem to map bits of the parent AS into the child. We'd
probably have to have some requirements that either parent AS is
identity-mapped to a subset of the userspace AS (effectively what we
have now) or that parent AS is the same as guest physical address.
Not sure which would work better.
> > I like the idea of a common DMA/IOMMU handling system across
> > platforms. However in order to be efficiently usable for POWER it
> > will need to include multiple windows, allowing the user to change
> > those windows and something like pre-registration to amortize
> > accounting costs for heavy vIOMMU load.
>
> I have a feeling /dev/ioasid is going to end up with some HW specific
> escape hatch to create some HW specific IOASID types and operate on
> them in a HW specific way.
>
> However, what I would like to see is that something simple like DPDK
> can have a single implementation - POWER should implement the standard
> operations and map them to something that will work for it.
>
> As an ideal, only things like the HW specific qemu vIOMMU driver
> should be reaching for all the special stuff.
I'm hoping we can even avoid that, usually. With the explicitly
created windows model I propose above, it should be able to: qemu will
create the windows according to the IOVA windows the guest platform
expects to see and they either will or won't work on the host platform
IOMMU. If they do, generic maps/unmaps should be sufficient. If they
don't well, the host IOMMU simply cannot emulate the vIOMMU so you're
out of luck anyway.
> In this way the kernel IOMMU driver and the qemu user vIOMMU driver
> would form something of a classical split user/kernel driver pattern.
>
> Jason
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
> From: Jason Gunthorpe <[email protected]>
> Sent: Monday, April 26, 2021 8:38 PM
>
[...]
> > Want to hear your opinion for one open here. There is no doubt that
> > an ioasid represents a HW page table when the table is constructed by
> > userspace and then linked to the IOMMU through the bind/unbind
> > API. But I'm not very sure about whether an ioasid should represent
> > the exact pgtable or the mapping metadata when the underlying
> > pgtable is indirectly constructed through map/unmap API. VFIO does
> > the latter way, which is why it allows multiple incompatible domains
> > in a single container which all share the same mapping metadata.
>
> I think VFIO's map/unmap is way too complex and we know it has bad
> performance problems.
Can you or Alex elaborate on where the complexity and performance problems
lie in VFIO map/unmap? We'd like to understand more detail and see how
to avoid them in the new interface.
>
> If /dev/ioasid is single HW page table only then I would focus on that
> implementation and leave it to userspace to span different
> /dev/ioasids if needed.
>
> > OK, now I see where the disconnection comes from. In my context ioasid
> > is the identifier that is actually used on the wire, but it seems you
> > treat it as a sw-defined namespace purely for representing page tables.
> > We should clarify this concept first before discussing other details.
>
> There is no general HW requirement that every IO page table be
> referred to by the same PASID and this API would have to support
Yes, but what is the value of allowing multiple PASIDs to refer to the
same I/O page table (except the nesting pgtable case)? Doesn't it
lead to a poor iotlb efficiency issue, similar to multiple iommu domains
referring to the same page table?
> non-PASID IO page tables as well. So I'd keep the two things
> separated in the uAPI - even though the kernel today has a global
> PASID pool.
For non-PASID usages the allocated PASID will be wasted if we don't
separate ioasid from pasid. But it may be worthwhile given the 1M available
pasids and the simplification in the uAPI, which then only needs to care
about one id space.
>
> > Then following your proposal, does it mean that we need another
> > interface for allocating PASID? and since ioasid means different
> > thing in uAPI and in-kernel API, possibly a new name is required to
> > avoid confusion?
>
> I would suggest have two ways to control the PASID
>
> 1) Over /dev/ioasid allocate a PASID for an IOASID. All future PASID
> based usages of the IOASID will use that global PASID
>
> 2) Over the device FD, when the IOASID is bound return the PASID that
> was selected. If the IOASID does not have a global PASID then the
> kernel is free to make something up. In this mode a single IOASID
> can have multiple PASIDs.
>
> Simple things like DPDK can use #2 and potentially have better PASID
> limits. hypervisors will most likely have to use #1, but it depends on
> how their vIOMMU interface works.
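Let me restate the two flows as I read them, with made-up ioctl names,
just to be sure we are talking about the same thing:

/* #1: a global PASID is allocated up front through /dev/ioasid */
ioctl(ioasid_fd, IOASID_ALLOC_PASID, ioasid, &pasid);
ioctl(device_fd_a, ATTACH_IOASID_PASID, pasid, ioasid);
ioctl(device_fd_b, ATTACH_IOASID_PASID, pasid, ioasid);  /* same PASID everywhere */

/* #2: the kernel picks a PASID per device at bind time */
ioctl(device_fd_a, ATTACH_IOASID, ioasid, &pasid_a);
ioctl(device_fd_b, ATTACH_IOASID, ioasid, &pasid_b);     /* may differ from pasid_a */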
Can you elaborate why DPDK wants to use #2 i.e. not using a global
PASID?
>
> I think the name IOASID is fine for the uAPI, the kernel version can
> be called ioasid_id or something.
ioasid is already an id and then ioasid_id just adds confusion. Another
point is that ioasid is currently used to represent both PCI PASID and
ARM substream ID in the kernel. It implies that if we want to separate
ioasid and pasid in the uAPI the 'pasid' also needs to be replaced with
another general term usable for substream ID. Are we making the
terms too confusing here?
>
> (also looking at ioasid.c, why do we need such a thin and odd wrapper
> around xarray?)
>
I'll leave it to Jean and Jacob.
Thanks
Kevin
> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, April 28, 2021 1:12 AM
>
[...]
> > As Alex says, if this line fails because of the group restrictions,
> > that's not great because it's not very obvious what's gone wrong.
>
> Okay, that is fair, but let's solve that problem directly. For
> instance netlink has been going in the direction of adding a "extack"
> from the kernel which is a descriptive error string. If the failing
> ioctl returned the string:
>
> "cannot join this device to the IOASID because device XXX in the
> same group #10 is in use"
>
> Would you agree it is now obvious what has gone wrong? In fact would
> you agree this is a lot better user experience than what applications
> do today even though they have the group FD?
>
Currently all the discussions are around implicit vs. explicit uAPI semantics
on the group restriction. However, if we look beyond groups, the implicit
semantics might be inevitable when dealing with incompatible iommu
domains. An existing example of iommu incompatibility is IOMMU_CACHE.
In the future there could be other incompatibilities, such as whether
nested translation is supported. In the end userspace has to do some
due diligence on understanding iommu topology and attributes to decide
how many VFIO containers or ioasid fds should be created. It does push
some burden to userspace, but it's difficult to define a group-like
kernel object to enforce such restrictions for iommu compatibility.
Then the best that the kernel can do is to return an informational error
message in case an incompatible device is attached to the existing domain.
If this is the perceived way to move forward anyway, I feel that removing
the explicit group FD from the uAPI doesn't make userspace worse...
Thanks
Kevin
> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, April 28, 2021 1:12 AM
>
[...]
> One option is VFIO can keep its group FD but nothing else will have
> anything like it. However I don't much like the idea that VFIO will
> have a special and unique programming model to do the same things
> other subsystems will do. That will make it harder for userspace to
> implement.
Hi, Jason,
I have a question here. Based on discussions so far, it's clear that the
new ioasid uAPI will differ from the existing VFIO uAPI a lot, e.g. ioasid-
centric operations, no group fd, no incompatible domains, etc. Then
I wonder how we plan to support legacy VFIO applications in this
transition phase. Earlier you mentioned the desire of directly
replacing /dev/vfio/vfio with /dev/ioasid and having ioasid to present
both VFIO and new uAPI. Doesn't it imply that we have to copy the
VFIO container/group semantics into /dev/ioasid although it's a special
programming model only for VFIO?
Alternatively we could keep all the container/group legacy within VFIO
and having /dev/ioasid support only the new uAPI semantics. In this case
VFIO will include a shim iommu backend to connect its legacy uAPI into
drivers/ioasid backend functions for backward compatibility. Then VFIO
will also support a new model which only uses its device uAPI to bind
to new ioasid fd w/o using any legacy container/group/iommu uAPI.
Does this sound like a plan?
Thanks
Kevin
On Wed, 28 Apr 2021 06:34:11 +0000
"Tian, Kevin" <[email protected]> wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Monday, April 26, 2021 8:38 PM
> >
> [...]
> > > Want to hear your opinion for one open here. There is no doubt that
> > > an ioasid represents a HW page table when the table is constructed by
> > > userspace and then linked to the IOMMU through the bind/unbind
> > > API. But I'm not very sure about whether an ioasid should represent
> > > the exact pgtable or the mapping metadata when the underlying
> > > pgtable is indirectly constructed through map/unmap API. VFIO does
> > > the latter way, which is why it allows multiple incompatible domains
> > > in a single container which all share the same mapping metadata.
> >
> > I think VFIO's map/unmap is way too complex and we know it has bad
> > performance problems.
>
> Can you or Alex elaborate on where the complexity and performance problems
> lie in VFIO map/unmap? We'd like to understand more detail and see how
> to avoid them in the new interface.
The map/unmap interface is really only good for long lived mappings,
the overhead is too high for things like vIOMMU use cases or any case
where the mapping is intended to be dynamic. Userspace drivers must
make use of a long lived buffer mapping in order to achieve performance.
The mapping and unmapping granularity has been a problem as well,
type1v1 allowed arbitrary unmaps to bisect the original mapping, with
the massive caveat that the caller relies on the return value of the
unmap to determine what was actually unmapped because the IOMMU use of
superpages is transparent to the caller. This led to type1v2 that
simply restricts the user to avoid ever bisecting mappings. That still
leaves us with problems for things like virtio-mem support where we
need to create initial mappings with a granularity that allows us to
later remove entries, which can prevent effective use of IOMMU
superpages.
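As a simplified illustration of the granularity issue (this assumes an
existing type1 container fd and an already-allocated buffer buf; the
structs and ioctls are from linux/vfio.h):

struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
        .vaddr = (__u64)buf,
        .iova  = 0,
        .size  = 1UL << 30,             /* one 1GB mapping */
};
ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);

struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .iova  = 2UL << 20,
        .size  = 4096,                  /* try to punch a 4K hole */
};
/* type1v1: allowed, but may tear down far more than 4K if the IOMMU used
 * a superpage; the caller only learns the real size from unmap.size.
 * type1v2: rejected, unmaps may not bisect an existing mapping. */
ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);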
Locked page accounting has been another constant issue. We perform
locked page accounting at the container level, where each container
accounts independently. A user may require multiple containers, the
containers may pin the same physical memory, but be accounted against
the user once per container.
Those are the main ones I can think of. It is nice to have a simple
map/unmap interface, I'd hope that a new /dev/ioasid interface wouldn't
raise the barrier to entry too high, but the user needs to have the
ability to have more control of their mappings and locked page
accounting should probably be offloaded somewhere. Thanks,
Alex
On Wed, Apr 28, 2021 at 10:58:29AM +1000, David Gibson wrote:
> On Tue, Apr 27, 2021 at 02:12:12PM -0300, Jason Gunthorpe wrote:
> > On Tue, Apr 27, 2021 at 03:08:46PM +1000, David Gibson wrote:
> > > > Starting from a BDF the general pseudo code is:
> > > > device_name = first_directory_of("/sys/bus/pci/devices/BDF/vfio/")
> > > > device_fd = open("/dev/vfio/"+device_name)
> > > > ioasidfd = open("/dev/ioasid")
> > > > ioctl(device_fd, JOIN_IOASID_FD, ioasidfd)
> > >
> > > This line is the problem.
> > >
> > > [Historical aside: Alex's early drafts for the VFIO interface looked
> > > quite similar to this. Ben Herrenschmidt and myself persuaded him it
> > > was a bad idea, and groups were developed instead. I still think it's
> > > a bad idea, and not just for POWER]
> >
> > Spawning the VFIO device FD from the group FD is incredibly gross from
> > a kernel design perspective. Since that was done the struct
> > vfio_device missed out on a sysfs presence and doesn't have the
> > typical 'struct device' member or dedicated char device you'd expect a
> > FD based subsystem to have.
> >
> > This basically traded normal usage of the driver core for something
> > that doesn't serve a technical usage. Given we are now nearly 10 years
> > later and see that real widely deployed applications are not doing
> > anything special with the group FD it makes me question the wisdom of
> > this choice.
>
> I'm really not sure what "anything special" would constitute here.
Well, really anything actually. All I see in, say, dpdk, is open the
group fd, get a device fd, do the container dance and never touch the
group fd again or care about groups in any way. It seems typical of
this class of application.
If dpdk is exposing other devices to a risk it certainly hasn't done
anything to make that obvious.
> > Okay, that is fair, but let's solve that problem directly. For
> > instance netlink has been going in the direction of adding a "extack"
> > from the kernel which is a descriptive error string. If the failing
> > ioctl returned the string:
> >
> > "cannot join this device to the IOASID because device XXX in the
> > same group #10 is in use"
>
> Um.. is there a sane way to return strings from an ioctl()?
Yes, it can be done, a string buffer pointer and length in the input
for instance.
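e.g. something like this, purely illustrative:

struct ioasid_attach_device {
        __u32 argsz;
        __u32 flags;
        __u32 ioasid;
        __u32 err_str_len;      /* size of the buffer below */
        __u64 err_str;          /* user pointer the kernel fills on failure,
                                   like netlink's extack message */
};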
> Getting the device fds from the group fd kind of follows, because it's
> unsafe to do basically anything on the device unless you already
> control the group (which in this case means attaching it to a
> container/ioasid). I'm entirely open to ways of doing that that are
> less inelegant from a sysfs integration point of view, but the point
> is you must manage the group before you can do anything at all with
> individual devices.
I think most likely VFIO is going to be the only thing to manage a
multi-device group.
I see things like VDPA being primarily about PASID, and an IOASID that
is matched to a PASID is inherently a single device IOMMU group.
> I don't see why. I mean, sure, you don't want explicitly the *vfio*
> group as such. But IOMMU group is already a cross-subsystem concept
> and you can explicitly expose that in a different way.
Yes, and no, the kernel drivers in something like VDPA have decided
what device and group they are in before we get to IOASID. It is
illogical to try to retro-actively bolt in a group concept to their
APIs.
> Again, I really think this is necessary complexity. You're right that
> far too little of the userspace properly understands group
> restrictions.. but these come from real hardware limitations, and I
> don't feel like making it *less* obvious in the interface is going to
> help that.
The appeal of making it less obvious is we can have a single
simplified API flow so that an application that doesn't understand or
care about groups can have uniformity.
Jason
On Wed, Apr 28, 2021 at 07:47:56AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Wednesday, April 28, 2021 1:12 AM
> >
> [...]
> > One option is VFIO can keep its group FD but nothing else will have
> > anything like it. However I don't much like the idea that VFIO will
> > have a special and unique programming model to do the same things
> > other subsystems will do. That will make it harder for userspace to
> > implement.
>
> Hi, Jason,
>
> I have a question here. Based on discussions so far, it's clear that the
> new ioasid uAPI will differ from the existing VFIO uAPI a lot, e.g. ioasid-
> centric operations, no group fd, no incompatible domains, etc. Then
> I wonder how we plan to support legacy VFIO applications in this
> transition phase.
I suspect the VFIO group fd will have to be registered with
/dev/ioasid in addition to each device if we are to retain the same
model.
> Earlier you mentioned the desire of directly replacing
> /dev/vfio/vfio with /dev/ioasid and having ioasid to present both
> VFIO and new uAPI. Doesn't it imply that we have to copy the VFIO
> container/group semantics into /dev/ioasid although it's a special
> programming model only for VFIO?
I gave that as something to think about; if it doesn't work out then
it is just a bad idea to discard.
> Alternatively we could keep all the container/group legacy within VFIO
> and having /dev/ioasid support only the new uAPI semantics. In this case
> VFIO will include a shim iommu backend to connect its legacy uAPI into
> drivers/ioasid backend functions for backward compatibility. Then VFIO
> will also support a new model which only uses its device uAPI to bind
> to new ioasid fd w/o using any legacy container/group/iommu uAPI.
> Does this sound like a plan?
It may be where we end up.. Though I fear it will make it overly
complex inside VFIO to access the new stuff. It would be very nice if
we could see a path where VFIO's insides could only deal with the
in-kernel ioasid handles, whatever they are.
Jason
On Wed, Apr 28, 2021 at 06:34:11AM +0000, Tian, Kevin wrote:
> > If /dev/ioasid is single HW page table only then I would focus on that
> > implementation and leave it to userspace to span different
> > /dev/ioasids if needed.
> >
> > > OK, now I see where the disconnection comes from. In my context ioasid
> > > is the identifier that is actually used on the wire, but it seems you
> > > treat it as a sw-defined namespace purely for representing page tables.
> > > We should clarify this concept first before discussing other details.
> >
> > There is no general HW requirement that every IO page table be
> > referred to by the same PASID and this API would have to support
>
> Yes, but what is the value of allowing multiple PASIDs to refer to the
> same I/O page table (except the nesting pgtable case)? Doesn't it
> lead to a poor iotlb efficiency issue, similar to multiple iommu domains
> referring to the same page table?
I think iotlb efficiency is up to the platform.
The general use case is to make an IOASID for something like the GPA
and use it concurrently with, say, three devices:
- VFIO (not PASID)
- VDPA (PASID capable HW)
- 'Future VDPA storage' (PASID capable HW)
The uAPI for this should be very general and the kernel should decide
the optimal way to configure the HW. Maybe it is one page table and
one PASID, or maybe it is something else.
Allowing the kernel to choose the PASID once it knows the RID is the
highest generality.
> > non-PASID IO page tables as well. So I'd keep the two things
> > separated in the uAPI - even though the kernel today has a global
> > PASID pool.
>
> For non-PASID usages the allocated PASID will be wasted if we don't
> separate ioasid from pasid. But it may be worthwhile given the 1M available
> pasids and the simplification in the uAPI, which then only needs to care
> about one id space.
I'd prefer this be a platform choice and not forced in the uAPI,
because we can never go back on it if we see that yes we need to
optimize here. I understand many platforms have different available
PASID spaces already.
> > Simple things like DPDK can use #2 and potentially have better PASID
> > limits. hypervisors will most likely have to use #1, but it depends on
> > how their vIOMMU interface works.
>
> Can you elaborate why DPDK wants to use #2 i.e. not using a global
> PASID?
It gives the kernel an option to make the decision about the PASID
when it has the full information, including the RID.
> > I think the name IOASID is fine for the uAPI, the kernel version can
> > be called ioasid_id or something.
>
> ioasid is already an id and then ioasid_id just adds confusion. Another
> point is that ioasid is currently used to represent both PCI PASID and
> ARM substream ID in the kernel. It implies that if we want to separate
> ioasid and pasid in the uAPI the 'pasid' also needs to be replaced with
> another general term usable for substream ID. Are we making the
> terms too confusing here?
This is why I also am not so sure about exposing the PASID in the API
because it is ultimately a HW specific item.
As I said to David, one avenue is to have some generic uAPI that is
very general and keep all this deeply detailed stuff, that really only
matters for qemu, as part of a more HW specific vIOMMU driver
interface.
Jason
On Wed, Apr 28, 2021 at 11:23:39AM +1000, David Gibson wrote:
> Yes. My proposed model for a unified interface would be that when you
> create a new container/IOASID, *no* IOVAs are valid.
Hurm, it is quite tricky. All IOMMUs seem to have a dead zone around
the MSI window, so negotiating this all in a general way is not going
to be a very simple API.
To be general it would be nicer to say something like 'I need XXGB of
IOVA space' 'I need 32 bit IOVA space' etc and have the kernel return
ranges that sum up to at least that big. Then the kernel can do
all its optimizations.
I guess you are going to say that the qemu PPC vIOMMU driver needs
more exact control..
> I expect we'd need some kind of query operation to expose limitations
> on the number of windows, addresses for them, available pagesizes etc.
Is page size an assumption that hugetlbfs will always be used for backing
memory or something?
> > As an ideal, only things like the HW specific qemu vIOMMU driver
> > should be reaching for all the special stuff.
>
> I'm hoping we can even avoid that, usually. With the explicitly
> created windows model I propose above, it should be able to: qemu will
> create the windows according to the IOVA windows the guest platform
> expects to see and they either will or won't work on the host platform
> IOMMU. If they do, generic maps/unmaps should be sufficient. If they
> don't well, the host IOMMU simply cannot emulate the vIOMMU so you're
> out of luck anyway.
It is not just P9 that has special stuff, and this whole area of PASID
seems to be quite different on every platform
If things fit very naturally and generally then maybe, but I've been
down this road before of trying to make a general description of a
group of very special HW. It ended in tears after 10 years when nobody
could understand the "general" API after it was Frankenstein'd up with
special cases for everything. Cautionary tale
There is a certain appeal to having some
'PPC_TCE_CREATE_SPECIAL_IOASID' entry point that has a wack of extra
information like windows that can be optionally called by the viommu
driver and it remains well defined and described.
Jason
On Wed, Apr 28, 2021 at 11:56:22AM -0300, Jason Gunthorpe wrote:
> On Wed, Apr 28, 2021 at 10:58:29AM +1000, David Gibson wrote:
> > On Tue, Apr 27, 2021 at 02:12:12PM -0300, Jason Gunthorpe wrote:
> > > On Tue, Apr 27, 2021 at 03:08:46PM +1000, David Gibson wrote:
> > > > > Starting from a BDF the general pseudo code is:
> > > > > device_name = first_directory_of("/sys/bus/pci/devices/BDF/vfio/")
> > > > > device_fd = open("/dev/vfio/"+device_name)
> > > > > ioasidfd = open("/dev/ioasid")
> > > > > ioctl(device_fd, JOIN_IOASID_FD, ioasidfd)
> > > >
> > > > This line is the problem.
> > > >
> > > > [Historical aside: Alex's early drafts for the VFIO interface looked
> > > > quite similar to this. Ben Herrenschmidt and myself persuaded him it
> > > > was a bad idea, and groups were developed instead. I still think it's
> > > > a bad idea, and not just for POWER]
> > >
> > > Spawning the VFIO device FD from the group FD is incredibly gross from
> > > a kernel design perspective. Since that was done the struct
> > > vfio_device missed out on a sysfs presence and doesn't have the
> > > typical 'struct device' member or dedicated char device you'd expect a
> > > FD based subsystem to have.
> > >
> > > This basically traded normal usage of the driver core for something
> > > that doesn't serve a technical usage. Given we are now nearly 10 years
> > > later and see that real widely deployed applications are not doing
> > > anything special with the group FD it makes me question the wisdom of
> > > this choice.
> >
> > I'm really not sure what "anything special" would constitute here.
>
> Well, really anything actually. All I see in, say, dpdk, is open the
> group fd, get a device fd, do the container dance and never touch the
> group fd again or care about groups in any way. It seems typical of
> this class of application.
Well, sure, the only operation you do on the group itself is attach it
to the container (and then every container operation can be thought of
as applying to all its attached groups). But that attach operation
really is fundamentally about the group. It always, unavoidably,
fundamentally affects every device in the group - including devices
you may not typically think about, like bridges and switches.
That is *not* true of the other device operations, like poking IO.
> If dpdk is exposing other devices to a risk it certainly hasn't done
> anything to make that obvious.
And in practice I suspect it will just break if you give it a >1
device group.
> > > Okay, that is fair, but let's solve that problem directly. For
> > > instance netlink has been going in the direction of adding a "extack"
> > > from the kernel which is a descriptive error string. If the failing
> > > ioctl returned the string:
> > >
> > > "cannot join this device to the IOASID because device XXX in the
> > > same group #10 is in use"
> >
> > Um.. is there a sane way to return strings from an ioctl()?
>
> Yes, it can be done, a string buffer pointer and length in the input
> for instance.
I suppose. Rare enough that I expect everyone will ignore it, alas :/.
> > Getting the device fds from the group fd kind of follows, because it's
> > unsafe to do basically anything on the device unless you already
> > control the group (which in this case means attaching it to a
> > container/ioasid). I'm entirely open to ways of doing that that are
> > less inelegant from a sysfs integration point of view, but the point
> > is you must manage the group before you can do anything at all with
> > individual devices.
>
> I think most likely VFIO is going to be the only thing to manage a
> multi-device group.
You don't get to choose that. You could explicitly limit other things
to only one-device groups, but that would be an explicit limitation.
Essentially any device can end up in a multi-device group, if you put
it behind a PCIe to PCI bridge, or a PCIe switch which doesn't support
access controls.
The groups are still there, whether or not other things want to deal
with them.
> I see things like VDPA being primarily about PASID, and an IOASID that
> is matched to a PASID is inherently a single device IOMMU group.
I don't know enough about PASID to make sense of that.
> > I don't see why. I mean, sure, you don't want explicitly the *vfio*
> > group as such. But IOMMU group is already a cross-subsystem concept
> > and you can explicitly expose that in a different way.
>
> Yes, and no, the kernel drivers in something like VDPA have decided
> what device and group they are in before we get to IOASID. It is
> illogical to try to retro-actively bolt in a group concept to their
> APIs.
Again, I don't know enough about VDPA to make sense of that. Are we
essentially talking non-PCI virtual devices here? In which case you
could define the VDPA "bus" to always have one-device groups.
> > Again, I really think this is necessary complexity. You're right that
> > far too little of the userspace properly understands group
> > restrictions.. but these come from real hardware limitations, and I
> > don't feel like making it *less* obvious in the interface is going to
> > help that.
>
> The appeal of making it less obvious is we can have a single
> simplified API flow so that an application that doesn't understand or
> care about groups can have uniformity.
I don't think simplified-but-wrong is a good goal. The thing about
groups is that if they're there, you can't just "not care" about them,
they affect you whether you like it or not.
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Wed, Apr 28, 2021 at 09:21:49PM -0300, Jason Gunthorpe wrote:
> On Wed, Apr 28, 2021 at 11:23:39AM +1000, David Gibson wrote:
>
> > Yes. My proposed model for a unified interface would be that when you
> > create a new container/IOASID, *no* IOVAs are valid.
>
> Hurm, it is quite tricky. All IOMMUs seem to have a dead zone around
> the MSI window, so negotiating this all in a general way is not going
> to be a very simple API.
>
> To be general it would be nicer to say something like 'I need XXGB of
> IOVA space' 'I need 32 bit IOVA space' etc and have the kernel return
> ranges that sum up to at least that big. Then the kernel can do
> all its optimizations.
Ah, yes, sorry. We do need an API that lets the kernel make more of
the decisions too. For userspace drivers it would generally be
sufficient to just ask for XXX size of IOVA space wherever you can get
it. Handling guests requires more precision. So, maybe a request
interface with a bunch of hint variables and a matching set of
MAP_FIXED-like flags to assert which ones aren't negotiable.
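Roughly like this, with field and flag names invented on the spot:

struct iova_window_req {
        __u64 base;        /* hint unless IOVA_WIN_FIXED_BASE is set */
        __u64 size;        /* minimum amount of IOVA space required */
        __u64 pagesize;    /* hint unless IOVA_WIN_FIXED_PAGESIZE is set */
        __u32 flags;       /* IOVA_WIN_FIXED_BASE | IOVA_WIN_FIXED_PAGESIZE | ... */
};
/* a userspace driver sets no FIXED flags and takes whatever it gets;
 * qemu emulating a vIOMMU sets the FIXED flags to match the guest
 * platform's model and fails early if the host can't provide it */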
> I guess you are going to say that the qemu PPC vIOMMU driver needs
> more exact control..
*Every* vIOMMU driver needs more exact control. The guest drivers
will expect to program the guest devices with IOVAs matching the guest
platform's IOMMU model. Therefore the backing host IOMMU has to be
programmed to respond to those IOVAs. If it can't be, there's no way
around it, and you want to fail out early. With this model that will
happen when qemu (say) requests the host IOMMU window(s) to match the
guest's expected IOVA ranges.
Actually, come to that, even guests without a vIOMMU need more exact
control: they'll expect IOVA to match GPA, so if your host IOMMU can't
be set up to translate the full range of GPAs, again, you're out of luck.
The only reason x86 has been able to ignore this is that the
assumption has been that all IOMMUs can translate IOVAs from 0..<a big
enough number for any reasonable RAM size>. Once you really start to
look at what the limits are, you need the exact window control I'm
describing.
> > I expect we'd need some kind of query operation to expose limitations
> > on the number of windows, addresses for them, available pagesizes etc.
>
> Is page size an assumption that hugetlbfs will always be used for backing
> memory or something?
So for TCEs (and maybe other IOMMUs out there), the IO page tables are
independent of the CPU page tables. They don't have the same format,
and they don't necessarily have the same page size. In the case of a
bare metal kernel working in physical addresses they can use that TCE
page size however they like. For userspace you get another layer of
complexity. Essentially to implement things correctly the backing
IOMMU needs to have a page size granularity that's the minimum of
whatever granularity the userspace or guest driver expects and the
host page size backing the memory.
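i.e. very roughly:

/* pseudocode: the IO page size the backend must provide is bounded by
 * what the guest/userspace driver expects *and* what backs the memory */
backend_io_pagesize = min(expected_io_pagesize, host_backing_pagesize);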
> > > As an ideal, only things like the HW specific qemu vIOMMU driver
> > > should be reaching for all the special stuff.
> >
> > I'm hoping we can even avoid that, usually. With the explicitly
> > created windows model I propose above, it should be able to: qemu will
> > create the windows according to the IOVA windows the guest platform
> > expects to see and they either will or won't work on the host platform
> > IOMMU. If they do, generic maps/unmaps should be sufficient. If they
> > don't well, the host IOMMU simply cannot emulate the vIOMMU so you're
> > out of luck anyway.
>
> It is not just P9 that has special stuff, and this whole area of PASID
> seems to be quite different on every platform
>
> If things fit very naturally and generally then maybe, but I've been
> down this road before of trying to make a general description of a
> group of very special HW. It ended in tears after 10 years when nobody
> could understand the "general" API after it was Frankenstein'd up with
> special cases for everything. Cautionary tale
>
> There is a certain appeal to having some
> 'PPC_TCE_CREATE_SPECIAL_IOASID' entry point that has a wack of extra
> information like windows that can be optionally called by the viommu
> driver and it remains well defined and described.
Windows really aren't ppc specific. They're absolutely there on x86
and everything else as well - it's just that people are used to having
a window at 0..<something largish> that you can often get away with
treating it sloppily.
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
Hi,
On 4/22/21 2:10 PM, Jason Gunthorpe wrote:
> On Thu, Apr 22, 2021 at 08:34:32AM +0000, Tian, Kevin wrote:
>
>> The shim layer could be considered as a new iommu backend in VFIO,
>> which connects VFIO iommu ops to the internal helpers in
>> drivers/ioasid.
>
> It may be the best we can do because of SPAPR, but the ideal outcome
> should be to remove the entire pluggable IOMMU stuff from vfio
> entirely and have it only use /dev/ioasid
>
> We should never add another pluggable IOMMU type to vfio - everything
> should be done through drivers/iommu now that it is much more capable.
>
>> Another tricky thing is that a container may be linked to multiple iommu
>> domains in VFIO, as devices in the container may locate behind different
>> IOMMUs with inconsistent capability (commit 1ef3e2bc).
>
> Frankly this sounds over complicated. I would think /dev/ioasid should
> select the IOMMU when the first device is joined, and all future joins
> must be compatible with the original IOMMU - ie there is only one set
> of IOMMU capabilities in a /dev/ioasid.
>
> This means qemu might have multiple /dev/ioasid's if the system has
> multiple incompatible IOMMUs (is this actually a thing?) The platform
> should design its IOMMU domains to minimize the number of
> /dev/ioasid's required.
>
> Is there a reason we need to share IOASIDs between completely
> divergent IOMMU implementations? I don't expect the HW should be able
> to physically share page tables??
>
> That decision point alone might be the thing that just says we can't
> ever have /dev/vfio/vfio == /dev/ioasid
>
>> Just to confirm. Above flow is for current map/unmap flavor as what
>> VFIO/vDPA do today. Later when nested translation is supported,
>> there is no need to detach gpa_ioasid_fd. Instead, a new cmd will
>> be introduced to nest rid_ioasid_fd on top of gpa_ioasid_fd:
>
> Sure.. The tricky bit will be to define both of the common nested
> operating modes.
>
> nested_ioasid = ioctl(ioasid_fd, CREATE_NESTED_IOASID, gpa_ioasid_id);
> ioctl(ioasid_fd, SET_NESTED_IOASID_PAGE_TABLES, nested_ioasid, ..)
>
> // IOMMU will match on the device RID, no PASID:
> ioctl(vfio_device, ATTACH_IOASID, nested_ioasid);
>
> // IOMMU will match on the device RID and PASID:
> ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
>
> Notice that ATTACH (or bind, whatever) is always done on the
> vfio_device FD. ATTACH tells the IOMMU HW to link the PCI BDF&PASID to
> a specific page table defined by an IOASID.
>
> I expect we have many flavours of IOASID tables, eg we have normal,
> and 'nested with table controlled by hypervisor'. ARM has 'nested with
> table controlled by guest' right? So like this?
Yes, the PASID table is fully controlled by the guest. Same for the
stage 1 table.
>
> nested_ioasid = ioctl(ioasid_fd, CREATE_DELEGATED_IOASID,
> gpa_ioasid_id, <some kind of viommu_id>)
> // PASID now goes to <viommu_id>
> ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
>
> Where <viommu_id> is some internal to the guest handle of the viommu
> page table scoped within gpa_ioasid_id? Like maybe it is GPA of the
> base of the page table?
Yes the GPA of the first level page table + some misc info like the max
number of IOASIDs.
>
> The guest can't select its own PASIDs without telling the hypervisor,
> right?
On ARM there is no system-wide IOASID allocator as there is on x86. So
the guest can select its own PASIDs without telling the hypervisor.
Thanks
Eric
>
>> I also feel hiding group from uAPI is a good thing and is interested in
>> the rationale behind for explicitly managing group in vfio (which is
>> essentially the same boundary as provided by iommu group), e.g. for
>> better user experience when group security is broken?
>
> Indeed, I can see how things might have just evolved into this, but if
> it has a purpose it seems pretty hidden. Whether we need it or not
> seems pretty unclear too.
>
> Jason
>
Hi,
On 4/23/21 1:49 PM, Jason Gunthorpe wrote:
> On Fri, Apr 23, 2021 at 09:06:44AM +0000, Tian, Kevin wrote:
>
>> Or could we still have just one /dev/ioasid but allow userspace to create
>> multiple gpa_ioasid_id's each associated to a different iommu domain?
>> Then the compatibility check will be done at ATTACH_IOASID instead of
>> JOIN_IOASID_FD.
>
> To my mind what makes sense is that /dev/ioasid presents a single
> IOMMU behavior that is basically the same. This may ultimately not be
> what we call a domain today.
>
> We may end up with a middle object which is a group of domains that
> all have the same capabilities, and we define capabilities in a way
> that most platforms have a single group of domains.
>
> The key capability of a group of domains is they can all share the HW
> page table representation, so if an IOASID instantiates a page table
> it can be assigned to any device on any domain in the group of domains.
>
> If you try to say that /dev/ioasid has many domains and they can't
> have their HW page tables shared then I think the implementation
> complexity will explode.
>
>> This does impose one burden to userspace though, to understand the
>> IOMMU compatibilities and figure out which incompatible features may
>> affect the page table management (while such knowledge is IOMMU
>> vendor specific) and then explicitly manage multiple /dev/ioasid's or
>> multiple gpa_ioasid_id's.
>
> Right, this seems very hard in the general case..
>
>> Alternatively is it a good design by having the kernel return error at
>> attach/join time to indicate that incompatibility is detected then the
>> userspace should open a new /dev/ioasid or creates a new gpa_ioasid_id
>> for the failing device upon such failure, w/o constructing its own
>> compatibility knowledge?
>
> Yes, this feels workable too
>
>>> This means qemue might have multiple /dev/ioasid's if the system has
>>> multiple incompatible IOMMUs (is this actually a thing?) The platform
>>
>> One example is Intel platform with igd. Typically there is one IOMMU
>> dedicated for igd and the other IOMMU serving all the remaining devices.
>> The igd IOMMU may not support IOMMU_CACHE while the other one
>> does.
>
> If we can do as above the two domains may be in the same group of
> domains and the IOMMU_CACHE is not exposed at the /dev/ioasid level.
>
> For instance the API could specify IOMMU_CACHE during attach, not
> during IOASID creation.
>
> Getting all the data model right in the API is going to be trickiest
> part of this.
>
>> yes, e.g. in vSVA both devices (behind divergent IOMMUs) are bound
>> to a single guest process which has an unique PASID and 1st-level page
>> table. Earlier incompatibility example is only for 2nd-level.
>
> Because when we get to here, things become inscrutable as an API if
> you are trying to say two different IOMMU presentations can actually
> be nested.
>
>>> Sure.. The tricky bit will be to define both of the common nested
>>> operating modes.
>>>
>>> nested_ioasid = ioctl(ioasid_fd, CREATE_NESTED_IOASID, gpa_ioasid_id);
>>> ioctl(ioasid_fd, SET_NESTED_IOASID_PAGE_TABLES, nested_ioasid, ..)
>>>
>>> // IOMMU will match on the device RID, no PASID:
>>> ioctl(vfio_device, ATTACH_IOASID, nested_ioasid);
>>>
>>> // IOMMU will match on the device RID and PASID:
>>> ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
>>
>> I'm a bit confused here why we have both pasid and ioasid notations together.
>> Why not use nested_ioasid as pasid directly (i.e. every pasid in nested mode
>> is created by CREATE_NESTED_IOASID)?
>
> The IOASID is not a PASID, it is just a page table.
>
> A generic IOMMU matches on either RID or (RID,PASID), so you should
> specify the PASID when establishing the match.
>
> IOASID only specifies the page table.
>
> So you read the above as configuring the path
>
> PCI_DEVICE -> (RID,PASID) -> nested_ioasid -> gpa_ioasid_id -> physical
>
> Where (RID,PASID) indicate values taken from the PCI packet.
>
> In principle the IOMMU could also be commanded to reuse the same
> ioasid page table with a different PASID:
>
> PCI_DEVICE_B -> (RID_B,PASID_B) -> nested_ioasid -> gpa_ioasid_id -> physical
>
> This is impossible if the ioasid == PASID in the API.
>
>> Below I list different scenarios for ATTACH_IOASID in my view. Here
>> vfio_device could be a real PCI function (RID), or a subfunction device
>> (RID+def_ioasid).
>
> What is RID+def_ioasid? The IOMMU does not match on IOASID's.
>
> A subfunction device always need to use PASID, or an internal IOMMU,
> confused what you are trying to explain?
>
>> If the whole PASID table is delegated to the guest in ARM case, the guest
>> can select its own PASIDs w/o telling the hypervisor.
>
> The hypervisor has to route the PASID's to the guest at some point - a
> guest can't just claim a PASID unilaterally, that would not be secure.
AFAIU on ARM the stage 2 table is uniquely defined per RID and the PASID
space is local to the RID. In other words, all PASIDs used along with a
given RID share the same stage 2 page table. So the minimum granularity
for guest assignment still is the RID and not the PASID (there is no
scalable mode on ARM). On x86, with scalable mode, each RID/PASID can
have different stage1/stage2 tables, which makes the minimum granularity
for guest device assignment the PASID.
So on ARM the guest can allocate PASIDs without interfering with the
host. We just need to make sure the PASID table format and max entries
matches the capability of the HW.
If the min granularity for guest assignment were the RID/PASID,
effectively a guest could steal all the PASIDs for a given RID.
Thanks
Eric
>
> If it is not done with per-PASID hypercalls then the hypervisor has to
> route all PASID's for a RID to the guest and /dev/ioasid needs to have
> a nested IOASID object that represents this connection - ie it points
> to the PASID table of the guest vIOMMU or something.
>
> Remember this all has to be compatible with mdev's too and without
> hypercalls to create PASIDs that will be hard: mdev sharing a RID and
> slicing the physical PASIDs can't support a 'send all PASIDs to the
> guest' model, or even a 'the guest gets to pick the PASID' option.
>
> Jason
>
Hi,
On 4/22/21 2:10 PM, Jason Gunthorpe wrote:
> On Thu, Apr 22, 2021 at 08:34:32AM +0000, Tian, Kevin wrote:
>
>> The shim layer could be considered as a new iommu backend in VFIO,
>> which connects VFIO iommu ops to the internal helpers in
>> drivers/ioasid.
>
> It may be the best we can do because of SPAPR, but the ideal outcome
> should be to remove the entire pluggable IOMMU stuff from vfio
> entirely and have it only use /dev/ioasid
>
> We should never add another pluggable IOMMU type to vfio - everything
> should be done through drivers/iommu now that it is much more capable.
>
>> Another tricky thing is that a container may be linked to multiple iommu
>> domains in VFIO, as devices in the container may locate behind different
>> IOMMUs with inconsistent capability (commit 1ef3e2bc).
>
> Frankly this sounds over complicated. I would think /dev/ioasid should
> select the IOMMU when the first device is joined, and all future joins
> must be compatible with the original IOMMU - ie there is only one set
> of IOMMU capabilities in a /dev/ioasid.
>
> This means qemu might have multiple /dev/ioasid's if the system has
> multiple incompatible IOMMUs (is this actually a thing?) The platform
> should design its IOMMU domains to minimize the number of
> /dev/ioasid's required.
>
> Is there a reason we need to share IOASIDs between completely
> divergent IOMMU implementations? I don't expect the HW should be able
> to physically share page tables??
>
> That decision point alone might be the thing that just says we can't
> ever have /dev/vfio/vfio == /dev/ioasid
>
>> Just to confirm. Above flow is for current map/unmap flavor as what
>> VFIO/vDPA do today. Later when nested translation is supported,
>> there is no need to detach gpa_ioasid_fd. Instead, a new cmd will
>> be introduced to nest rid_ioasid_fd on top of gpa_ioasid_fd:
>
> Sure.. The tricky bit will be to define both of the common nested
> operating modes.
>
From the pseudo code,
gpa_ioasid_id = ioctl(ioasid_fd, CREATE_IOASID, ..)
ioctl(ioasid_fd, SET_IOASID_PAGE_TABLES, ..)
I fail to understand whether the SET_IOASID_PAGE_TABLES would apply to
all the IOASIDs within /dev/ioasid or to a specific one.
Also, in subsequent emails when you talk about IOASID, is it the
ioasid_id? Just to double check the terminology.
> nested_ioasid = ioctl(ioasid_fd, CREATE_NESTED_IOASID, gpa_ioasid_id);
> ioctl(ioasid_fd, SET_NESTED_IOASID_PAGE_TABLES, nested_ioasid, ..)
Is the nested_ioasid the allocated PASID id or is it a completely
different object id?
>
> // IOMMU will match on the device RID, no PASID:
> ioctl(vfio_device, ATTACH_IOASID, nested_ioasid);
>
> // IOMMU will match on the device RID and PASID:
> ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
here I see you pass a different pasid, so I guess they are different, in
which case you would need to have an allocator function for this pasid,
right?
Thanks
Eric
>
> Notice that ATTACH (or bind, whatever) is always done on the
> vfio_device FD. ATTACH tells the IOMMU HW to link the PCI BDF&PASID to
> a specific page table defined by an IOASID.
>
> I expect we have many flavours of IOASID tables, eg we have normal,
> and 'nested with table controlled by hypervisor'. ARM has 'nested with
> table controlled by guest' right? So like this?
>
> nested_ioasid = ioctl(ioasid_fd, CREATE_DELEGATED_IOASID,
> gpa_ioasid_id, <some kind of viommu_id>)
> // PASID now goes to <viommu_id>
> ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
>
> Where <viommu_id> is some internal to the guest handle of the viommu
> page table scoped within gpa_ioasid_id? Like maybe it is GPA of the
> base of the page table?
>
> The guest can't select its own PASIDs without telling the hypervisor,
> right?
>
>> I also feel hiding group from uAPI is a good thing and is interested in
>> the rationale behind for explicitly managing group in vfio (which is
>> essentially the same boundary as provided by iommu group), e.g. for
>> better user experience when group security is broken?
>
> Indeed, I can see how things might have just evolved into this, but if
> it has a purpose it seems pretty hidden. Whether we need it or not
> seems pretty unclear too.
>
> Jason
>
On Thu, Apr 29, 2021 at 03:26:55PM +0200, Auger Eric wrote:
> From the pseudo code,
>
> gpa_ioasid_id = ioctl(ioasid_fd, CREATE_IOASID, ..)
> ioctl(ioasid_fd, SET_IOASID_PAGE_TABLES, ..)
>
> I fail to understand whether the SET_IOASID_PAGE_TABLES would apply to
> the whole IOASIDs within /dev/ioasid or to a specific one.
Sorry, nearly every IOCTL would be scoped to a specific IOASID as one
of the arguments.
> Also in subsequent emails when you talk about IOASID, is it the
> ioasid_id, just to double check the terminology.
I am referring to IOASID as 'handle of the page table object inside the
/dev/ioasid fd'. Whether that is equal to some HW value or not I think
remains a decision point.
Basically the fd has an xarray of 'struct [something] *' and the
IOASID is an index into that FD's private xarray. This is necessary to
create proper security as even if we have global PASID numbers or
something they still need to be isolated to only the FD that has
been authorized access.
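Something like this, just as a sketch (struct ioasid_pt and the function
names are made up here):

#include <linux/xarray.h>

struct ioasid_fd_ctx {
        struct xarray ioasids;  /* xa_init_flags(&ioasids, XA_FLAGS_ALLOC)
                                   at open time; indexed by the handle */
};

static int ioasid_create(struct ioasid_fd_ctx *ctx, struct ioasid_pt *pt,
                         u32 *handle)
{
        /* the handle is only meaningful within this fd, which is what
           isolates it from other users even if HW IDs are global */
        return xa_alloc(&ctx->ioasids, handle, pt, xa_limit_32b, GFP_KERNEL);
}

static struct ioasid_pt *ioasid_get(struct ioasid_fd_ctx *ctx, u32 handle)
{
        return xa_load(&ctx->ioasids, handle);
}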
> > nested_ioasid = ioctl(ioasid_fd, CREATE_NESTED_IOASID, gpa_ioasid_id);
> > ioctl(ioasid_fd, SET_NESTED_IOASID_PAGE_TABLES, nested_ioasid, ..)
> is the nested_ioasid the allocated PASID id or is it a complete
> different object id.
It is the IOASID handle above.
> >
> > // IOMMU will match on the device RID, no PASID:
> > ioctl(vfio_device, ATTACH_IOASID, nested_ioasid);
> >
> > // IOMMU will match on the device RID and PASID:
> > ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
> here I see you pass a different pasid, so I guess they are different, in
> which case you would need to have an allocator function for this pasid,
> right?
Yes, the underlying HW ID (PASID or substream id or whatever) is
something slightly different.
Jason
On Thu, Apr 29, 2021 at 01:04:05PM +1000, David Gibson wrote:
> Again, I don't know enough about VDPA to make sense of that. Are we
> essentially talking non-PCI virtual devices here? In which case you
> could define the VDPA "bus" to always have one-device groups.
It is much worse than that.
What these non-PCI devices need is for the kernel driver to be part of
the IOMMU group of the underlying PCI device but tell VFIO land that
"groups don't matter"
Today mdev tries to fake this by using singleton iommu groups, but it
is really horrible and directly hacks up the VFIO IOMMU code to
understand these special cases. Intel was proposing more special
hacking in the VFIO IOMMU code to extend this to PASID.
When we get to a /dev/ioasid this is all nonsense. The kernel device
driver is going to have to tell drivers/iommu exactly what kind of
ioasid it can accept, be it a PASID inside a kernel owned group, a SW
emulated 'mdev' ioasid, or whatever.
In these cases the "group" idea has become a fiction that just creates
a pain. "Just reorganize VDPA to do something insane with the driver
core so we can create a dummy group to satisfy an unnecessary uAPI
restriction" is not a very compelling argument.
So if the nonsensical groups goes away for PASID/mdev, where does it
leave the uAPI in other cases?
> I don't think simplified-but-wrong is a good goal. The thing about
> groups is that if they're there, you can't just "not care" about them,
> they affect you whether you like it or not.
You really can. If one thing claims the group then all the other group
devices become locked out.
The main point to understand is that groups are NOT an application
restriction! It is a whole system restriction that the operator needs
to understand and deal with. This is why things like dpdk don't care
about the group at all - there is nothing they can do with the
information.
If the operator says to run dpdk on a specific device then the
operator is the one that has to deal with all the other devices in the
group getting locked out.
At best the application can make it more obvious that the operator is
doing something dangerous, but the current kernel API doesn't seem to
really support that either.
Jason
On Thu, Apr 29, 2021 at 01:20:22PM +1000, David Gibson wrote:
> > There is a certain appeal to having some
> > 'PPC_TCE_CREATE_SPECIAL_IOASID' entry point that has a wack of extra
> > information like windows that can be optionally called by the viommu
> > driver and it remains well defined and described.
>
> Windows really aren't ppc specific. They're absolutely there on x86
> and everything else as well - it's just that people are used to having
> a window at 0..<something largish> that you can often get away with
> treating it sloppily.
My point is this detailed control seems to go on to more than just
windows. As you say the vIOMMU is emulating specific HW that needs to
have kernel interfaces to match it exactly.
I'm remarking that trying to unify every HW IOMMU implementation that
ever has/will exist into a generic API complete enough to allow the
vIOMMU to be created is likely to result in an API too complicated to
understand..
Jason
On Mon, May 03, 2021 at 01:05:30PM -0300, Jason Gunthorpe wrote:
> On Thu, Apr 29, 2021 at 01:20:22PM +1000, David Gibson wrote:
> > > There is a certain appeal to having some
> > > 'PPC_TCE_CREATE_SPECIAL_IOASID' entry point that has a wack of extra
> > > information like windows that can be optionally called by the viommu
> > > driver and it remains well defined and described.
> >
> > Windows really aren't ppc specific. They're absolutely there on x86
> > and everything else as well - it's just that people are used to having
> > a window at 0..<something largish> that you can often get away with
> > treating it sloppily.
>
> My point is this detailed control seems to go on to more than just
> windows. As you say the vIOMMU is emulating specific HW that needs to
> have kernel interfaces to match it exactly.
It's really not that bad. The case of emulating the PAPR vIOMMU on
something else is relatively easy, because all updates to the IO page
tables go through hypercalls. So, as long as the backend IOMMU can
map all the IOVAs that the guest IOMMU can, then qemu's implementation
of those hypercalls just needs to put an equivalent mapping in the
backend, which it can do with a generic VFIO_DMA_MAP.
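Purely as an illustration of that last point, here is a minimal sketch of how a hypercall handler could shadow one guest TCE update into the existing generic map interface. The VFIO_IOMMU_MAP_DMA ioctl and struct vfio_iommu_type1_dma_map are the real type1 interface; the handler name, its arguments, and the container fd are made up for the example, and qemu's actual code is structured differently:
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical handler: shadow one guest IO page table update into type1 */
static int shadow_tce_update(int container_fd, uint64_t guest_iova,
                             uint64_t host_vaddr, uint64_t page_size)
{
        struct vfio_iommu_type1_dma_map map = {
                .argsz = sizeof(map),
                .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
                .vaddr = host_vaddr,   /* userspace VA backing the guest page */
                .iova  = guest_iova,   /* IOVA the guest TCE points at */
                .size  = page_size,
        };

        /* one guest IO page table update becomes one generic map call */
        return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
}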
vIOMMUs with page tables in guest memory are harder, but only really
in the usual ways that a vIOMMU of that type is harder (needs cache
mode or whatever). At whatever point you need to shadow from the
guest IO page tables to the host backend, you can again do that with
generic maps, as long as the backend supports the necessary IOVAs, and
has an IO page size that's equal to or a submultiple of the vIOMMU
page size.
> I'm remarking that trying to unify every HW IOMMU implementation that
> ever has/will exist into a generic API complete enough to allow the
> vIOMMU to be created is likely to result in an API too complicated to
> understand..
Maybe not every one, but I think we can get a pretty wide range with a
reasonable interface. Explicitly handling IOVA windows does most of
it. And we kind of need to handle that anyway to expose what ranges
the IOMMU is capable of translating. I think making the handling of
valid IOVA windows explicit makes things simpler than having
per-backend-family interfaces to expose the limits of their
translation ranges, which is what's likely to happen without it.
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
Hi Jason,
On Wed, 28 Apr 2021 17:46:06 -0300, Jason Gunthorpe <[email protected]> wrote:
> > > I think the name IOASID is fine for the uAPI, the kernel version can
> > > be called ioasid_id or something.
> >
> > ioasid is already an id and then ioasid_id just adds confusion. Another
> > point is that ioasid is currently used to represent both PCI PASID and
> > ARM substream ID in the kernel. It implies that if we want to separate
> > ioasid and pasid in the uAPI the 'pasid' also needs to be replaced with
> > another general term usable for substream ID. Are we making the
> > terms too confusing here?
>
> This is why I also am not so sure about exposing the PASID in the API
> because it is ultimately a HW specific item.
>
> As I said to David, one avenue is to have some generic uAPI that is
> very general and keep all this deeply detailed stuff, that really only
> matters for qemu, as part of a more HW specific vIOMMU driver
> interface.
I think it is not just for QEMU. I am assuming you meant PASID is
needed for guest drivers to program assigned but not mediated devices.
User space drivers may also need to get the real HW PASID to program it
onto the HW. So this uAPI needs to provide some lookup functionality. Perhaps
the kernel generic version can be called ioasid_hw_id?
So we have the following per my understanding:
- IOASID: a userspace logical number which identifies a page table; this can
be a first-level (GVA->GPA) or a second-level (GPA->HPA) page table.
- PASID: strictly defined in PCIe term
- Substream ID: strictly defined in ARM SMMUv3 spec.
- IOASID_HW_ID: a generic ID backed by PASID, Substream ID, or any other
HW IDs used to tag DMA
Is that right?
Thanks,
Jacob
On Tue, May 04, 2021 at 09:22:55AM -0700, Jacob Pan wrote:
> Hi Jason,
>
> On Wed, 28 Apr 2021 17:46:06 -0300, Jason Gunthorpe <[email protected]> wrote:
>
> > > > I think the name IOASID is fine for the uAPI, the kernel version can
> > > > be called ioasid_id or something.
> > >
> > > ioasid is already an id and then ioasid_id just adds confusion. Another
> > > point is that ioasid is currently used to represent both PCI PASID and
> > > ARM substream ID in the kernel. It implies that if we want to separate
> > > ioasid and pasid in the uAPI the 'pasid' also needs to be replaced with
> > > another general term usable for substream ID. Are we making the
> > > terms too confusing here?
> >
> > This is why I also am not so sure about exposing the PASID in the API
> > because it is ultimately a HW specific item.
> >
> > As I said to David, one avenue is to have some generic uAPI that is
> > very general and keep all this deeply detailed stuff, that really only
> > matters for qemu, as part of a more HW specific vIOMMU driver
> > interface.
> I think it is not just for QEMU. I am assuming you meant PASID is
> needed for guest driver to program assigned but not mediated devices.
Anything that directly operates the device and tries to instantiate
PASIDs for vfio-pci devices will need to understand the PASID.
> User space drivers may also need to get the real HW PASID to program it on
> to the HW. So this uAPI need to provide some lookup functionality. Perhaps
> the kernel generic version can be called ioasid_hw_id?
>
> So we have the following per my understanding:
> - IOASID: a userspace logical number which identifies a page table, this can
> be a first level (GVA-GPA), or a second level (GPA->HPA) page table.
> - PASID: strictly defined in PCIe term
> - Substream ID: strictly defined in ARM SMMUv3 spec.
> - IOASID_HW_ID: a generic ID backed by PASID, Substream ID, or any other
> HW IDs used to tag DMA
>
> Is that right?
It is reasonable. If an IOASID_HW_ID IOCTL can be backed with an enum that
qualifies its exact nature it might be perfectly fine.
Jason
Hi Kevin,
On Wed, 28 Apr 2021 06:34:11 +0000, "Tian, Kevin" <[email protected]>
wrote:
> >
> > (also looking at ioasid.c, why do we need such a thin and odd wrapper
> > around xarray?)
> >
>
> I'll leave it to Jean and Jacob.
I am not sure whether you are referring to the current ioasid.c or the
changes proposed in this patchset. I added a per VM/ioasid_set
(also per /dev/ioasid fd) xarray to store guest-host PASID mappings.
The current code has an xarray for the allocators.
struct ioasid_allocator_data {
struct ioasid_allocator_ops *ops;
struct list_head list;
struct list_head slist;
#define IOASID_ALLOCATOR_CUSTOM BIT(0) /* Needs framework to track results */
unsigned long flags;
struct xarray xa;
struct rcu_head rcu;
};
Could you elaborate?
Thanks,
Jacob
On Wed, Apr 28, 2021 at 06:58:19AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Wednesday, April 28, 2021 1:12 AM
> >
> [...]
> > > As Alex says, if this line fails because of the group restrictions,
> > > that's not great because it's not very obvious what's gone wrong.
> >
> > Okay, that is fair, but let's solve that problem directly. For
> > instance netlink has been going in the direction of adding a "extack"
> > from the kernel which is a descriptive error string. If the failing
> > ioctl returned the string:
> >
> > "cannot join this device to the IOASID because device XXX in the
> > same group #10 is in use"
> >
> > Would you agree it is now obvious what has gone wrong? In fact would
> > you agree this is a lot better user experience than what applications
> > do today even though they have the group FD?
> >
>
> Currently all the discussions are around implicit vs. explicit uAPI semantics
> on the group restriction. However if we look beyond group the implicit
> semantics might be inevitable when dealing with incompatible iommu
> domains. An existing example of iommu incompatibility is IOMMU_
> CACHE.
I still think we need to get rid of these incompatibilities
somehow. Having multiple HW incompatible IOASID in the same platform
is just bad all around.
When modeling in userspace IOMMU_CACHE sounds like it is a property of
each individual IOASID, not an attribute that requires a new domain.
People that want to create cache-bypass IOASIDs should just ask for
that directly.
Jason
On Tue, May 04, 2021 at 08:41:48AM -0700, Jacob Pan wrote:
> > >
> > > (also looking at ioasid.c, why do we need such a thin and odd wrapper
> > > around xarray?)
> > >
> >
> > I'll leave it to Jean and Jacob.
> Could you elaborate?
I mean stuff like this:
int ioasid_set_data(ioasid_t ioasid, void *data)
{
struct ioasid_data *ioasid_data;
int ret = 0;
spin_lock(&ioasid_allocator_lock);
ioasid_data = xa_load(&active_allocator->xa, ioasid);
if (ioasid_data)
rcu_assign_pointer(ioasid_data->private, data);
else
ret = -ENOENT;
spin_unlock(&ioasid_allocator_lock);
/*
* Wait for readers to stop accessing the old private data, so the
* caller can free it.
*/
if (!ret)
synchronize_rcu();
return ret;
}
EXPORT_SYMBOL_GPL(ioasid_set_data);
It is a weird way to use xarray to have a structure which
itself is just a wrapper around another RCU protected structure.
Make the caller supply the ioasid_data memory, embedded in its own
element, get rid of the void * and rely on XA_ZERO_ENTRY to hold
allocated but not active entries.
Make the synchronize_rcu() the caller's responsibility, and callers
should really be able to use call_rcu()
Jason
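To make the suggestion concrete, a rough sketch of what caller-supplied, embedded ioasid_data could look like. All names here are hypothetical, the reservation of allocated-but-not-active entries is omitted, and lookups would still need to run under rcu_read_lock():
#include <linux/ioasid.h>
#include <linux/slab.h>
#include <linux/xarray.h>

/* hypothetical: the core struct becomes public and is embedded by callers */
struct ioasid_data {
        ioasid_t id;
        struct rcu_head rcu;
};

/* example caller: no void *private needed, state lives in the outer struct */
struct my_pasid_ctx {
        struct ioasid_data base;        /* stored in the core xarray */
        struct mm_struct *mm;           /* caller-private state */
};

static DEFINE_XARRAY_ALLOC(ioasid_xa);

/* core side: store the caller's memory instead of allocating a wrapper */
static int ioasid_alloc_data(struct ioasid_data *data, ioasid_t min, ioasid_t max)
{
        return xa_alloc(&ioasid_xa, &data->id, data, XA_LIMIT(min, max),
                        GFP_KERNEL);
}

/* caller side: look up under rcu_read_lock() and convert with container_of */
static struct my_pasid_ctx *my_ctx_find(ioasid_t id)
{
        struct ioasid_data *data = xa_load(&ioasid_xa, id);

        return data ? container_of(data, struct my_pasid_ctx, base) : NULL;
}

/*
 * teardown: remove from the xarray, free after a grace period; no
 * synchronize_rcu() inside the core
 */
static void my_ctx_free(struct my_pasid_ctx *ctx)
{
        xa_erase(&ioasid_xa, ctx->base.id);
        kfree_rcu(ctx, base.rcu);
}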
On Tue, May 04, 2021 at 01:54:55PM +1000, David Gibson wrote:
> On Mon, May 03, 2021 at 01:05:30PM -0300, Jason Gunthorpe wrote:
> > On Thu, Apr 29, 2021 at 01:20:22PM +1000, David Gibson wrote:
> > > > There is a certain appeal to having some
> > > > 'PPC_TCE_CREATE_SPECIAL_IOASID' entry point that has a wack of extra
> > > > information like windows that can be optionally called by the viommu
> > > > driver and it remains well defined and described.
> > >
> > > Windows really aren't ppc specific. They're absolutely there on x86
> > > and everything else as well - it's just that people are used to having
> > > a window at 0..<something largish> that you can often get away with
> > > treating it sloppily.
> >
> > My point is this detailed control seems to go on to more than just
> > windows. As you say the vIOMMU is emulating specific HW that needs to
> > have kernel interfaces to match it exactly.
>
> It's really not that bad. The case of emulating the PAPR vIOMMU on
> something else is relatively easy, because all updates to the IO page
> tables go through hypercalls. So, as long as the backend IOMMU can
> map all the IOVAs that the guest IOMMU can, then qemu's implementation
> of those hypercalls just needs to put an equivalent mapping in the
> backend, which it can do with a generic VFIO_DMA_MAP.
So you also want the PAPR vIOMMU driver to run on, say, an ARM IOMMU?
> vIOMMUs with page tables in guest memory are harder, but only really
> in the usual ways that a vIOMMU of that type is harder (needs cache
> mode or whatever). At whatever point you need to shadow from the
> guest IO page tables to the host backend, you can again do that with
> generic maps, as long as the backend supports the necessary IOVAs, and
> has an IO page size that's equal to or a submultiple of the vIOMMU
> page size.
But this definitely all becomes HW specific.
For instance I want to have an ARM vIOMMU driver it needs to do some
ret = ioctl(ioasid_fd, CREATE_NESTED_IOASID, [page table format is ARMvXXX])
if (ret == -EOPNOTSUPP)
ret = ioctl(ioasid_fd, CREATE_NORMAL_IOASID, ..)
// and do completely different and more expensive emulation
I can get a little bit of generality, but at the end of the day the
IOMMU must create a specific HW layout of the nested page table; if it
can't, it can't.
> > I'm remarking that trying to unify every HW IOMMU implementation that
> > ever has/will exist into a generic API complete enough to allow the
> > vIOMMU to be created is likely to result in an API too complicated to
> > understand..
>
> Maybe not every one, but I think we can get a pretty wide range with a
> reasonable interface.
It sounds like a reasonable guideline is whether the feature is actually
general to all IOMMUs and can be used by qemu as part of a vIOMMU
emulation when compatible vIOMMU HW is not available.
Having 'requested window' support that isn't actually implemented in
every IOMMU is going to mean the PAPR vIOMMU emulation won't work,
defeating the whole point of making things general?
Jason
Hi Jason,
On Tue, 4 May 2021 15:00:50 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Tue, May 04, 2021 at 08:41:48AM -0700, Jacob Pan wrote:
> > > >
> > > > (also looking at ioasid.c, why do we need such a thin and odd
> > > > wrapper around xarray?)
> > > >
> > >
> > > I'll leave it to Jean and Jacob.
>
> > Could you elaborate?
>
> I mean stuff like this:
>
> int ioasid_set_data(ioasid_t ioasid, void *data)
> {
> struct ioasid_data *ioasid_data;
> int ret = 0;
>
> spin_lock(&ioasid_allocator_lock);
> ioasid_data = xa_load(&active_allocator->xa, ioasid);
> if (ioasid_data)
> rcu_assign_pointer(ioasid_data->private, data);
> else
> ret = -ENOENT;
> spin_unlock(&ioasid_allocator_lock);
>
> /*
> * Wait for readers to stop accessing the old private data, so the
> * caller can free it.
> */
> if (!ret)
> synchronize_rcu();
>
> return ret;
> }
> EXPORT_SYMBOL_GPL(ioasid_set_data);
>
> It is a weird way to use xarray to have a structure which
> itself is just a wrapper around another RCU protected structure.
>
> Make the caller supply the ioasid_data memory, embedded in its own
> element, get rid of the void * and rely on XA_ZERO_ENTRY to hold
> allocated but not active entries.
>
Let me try to paraphrase to make sure I understand. Currently
struct ioasid_data is private to the ioasid core; its memory is allocated by
the ioasid core.
You are suggesting the following:
1. make struct ioasid_data public
2. caller allocates memory for ioasid_data, initializes it, then passes it to
ioasid_alloc to store in the xarray
3. caller will be responsible for setting private data inside ioasid_data
and doing call_rcu after updates if needed.
Correct?
> Make the synchronize_rcu() the caller responsiblity, and callers
> should really be able to use call_rcu()
>
> Jason
Thanks,
Jacob
On Tue, May 04, 2021 at 03:11:54PM -0700, Jacob Pan wrote:
> > It is a weird way to use xarray to have a structure which
> > itself is just a wrapper around another RCU protected structure.
> >
> > Make the caller supply the ioasid_data memory, embedded in its own
> > element, get rid of the void * and rely on XA_ZERO_ENTRY to hold
> > allocated but not active entries.
> >
> Let me try to paraphrase to make sure I understand. Currently
> struct ioasid_data is private to the iasid core, its memory is allocated by
> the ioasid core.
>
> You are suggesting the following:
> 1. make struct ioasid_data public
> 2. caller allocates memory for ioasid_data, initialize it then pass it to
> ioasid_alloc to store in the xarray
> 3. caller will be responsible for setting private data inside ioasid_data
> and do call_rcu after update if needed.
Basically, but you probably won't need a "private data" once the
caller has this struct as it can just embed it in whatever larger
struct makes sense for it and use container_of/etc
I didn't look too closely at the whole thing though. Honestly I'm a
bit puzzled why we need a pluggable global allocator framework.. The
whole framework went to some trouble to isolate everything into iommu
drivers, and then that whole design is disturbed by this global thing.
Jason
On 05/05/2021 04:15, Jason Gunthorpe wrote:
> On Tue, May 04, 2021 at 01:54:55PM +1000, David Gibson wrote:
>> On Mon, May 03, 2021 at 01:05:30PM -0300, Jason Gunthorpe wrote:
>>> On Thu, Apr 29, 2021 at 01:20:22PM +1000, David Gibson wrote:
>>>>> There is a certain appeal to having some
>>>>> 'PPC_TCE_CREATE_SPECIAL_IOASID' entry point that has a wack of extra
>>>>> information like windows that can be optionally called by the viommu
>>>>> driver and it remains well defined and described.
>>>>
>>>> Windows really aren't ppc specific. They're absolutely there on x86
>>>> and everything else as well - it's just that people are used to having
>>>> a window at 0..<something largish> that you can often get away with
>>>> treating it sloppily.
>>>
>>> My point is this detailed control seems to go on to more than just
>>> windows. As you say the vIOMMU is emulating specific HW that needs to
>>> have kernel interfaces to match it exactly.
>>
>> It's really not that bad. The case of emulating the PAPR vIOMMU on
>> something else is relatively easy, because all updates to the IO page
>> tables go through hypercalls. So, as long as the backend IOMMU can
>> map all the IOVAs that the guest IOMMU can, then qemu's implementation
>> of those hypercalls just needs to put an equivalent mapping in the
>> backend, which it can do with a generic VFIO_DMA_MAP.
>
> So you also want the PAPR vIOMMU driver to run on, say, an ARM IOMMU?
This is a good feature in general: let's say there is a Linux-supported
device which has a proprietary device firmware update tool which only
exists as an x86 binary and your hardware is not x86 -
running qemu + vfio in full emulation would provide a way to run the
tool to update a physical device.
--
Alexey
Hi Jason,
On 4/29/21 10:04 PM, Jason Gunthorpe wrote:
> On Thu, Apr 29, 2021 at 03:26:55PM +0200, Auger Eric wrote:
>> From the pseudo code,
>>
>> gpa_ioasid_id = ioctl(ioasid_fd, CREATE_IOASID, ..)
>> ioctl(ioasid_fd, SET_IOASID_PAGE_TABLES, ..)
>>
>> I fail to understand whether the SET_IOASID_PAGE_TABLES would apply to
>> the whole IOASIDs within /dev/ioasid or to a specific one.
>
> Sorry, nearly every IOCTL would be scoped to a specific IOASID as one
> of the arguments.
OK thank you for the clarification.
>
>> Also in subsequent emails when you talk about IOASID, is it the
>> ioasid_id, just to double check the terminology.
>
> I am refering to IOASID as 'handle of the page table object inside the
> /dev/ioasid fd'. If that is equal to some HW value or not I think
> remains as decision point.
OK
>
> Basically the fd has an xarray of 'struct [something] *' and the
> IOASID is index to that FD's private xarray. This is necessary to
> create proper security as even if we have global PASID numbers or
> something they still need to be isolated to only the FD that has
> been authorized access.
>
>>> nested_ioasid = ioctl(ioasid_fd, CREATE_NESTED_IOASID, gpa_ioasid_id);
>>> ioctl(ioasid_fd, SET_NESTED_IOASID_PAGE_TABLES, nested_ioasid, ..)
>> is the nested_ioasid the allocated PASID id or is it a complete
>> different object id.
>
> It is the IOASID handle above.
OK, as per the following emails and the comment below, IOASID and PASID are
different. The first would be a logical ID while the second is the HW ID.
Thanks
Eric
>
>>>
>>> // IOMMU will match on the device RID, no PASID:
>>> ioctl(vfio_device, ATTACH_IOASID, nested_ioasid);
>>>
>>> // IOMMU will match on the device RID and PASID:
>>> ioctl(vfio_device, ATTACH_IOASID_PASID, pasid, nested_ioasid);
>> here I see you pass a different pasid, so I guess they are different, in
>> which case you would need to have an allocator function for this pasid,
>> right?
>
> Yes, the underlying HW ID (PASID or substream id or whatver) is
> something slightly different
>
> Jason
>
On Wed, May 05, 2021 at 02:28:53PM +1000, Alexey Kardashevskiy wrote:
> This is a good feature in general when let's say there is a linux supported
> device which has a proprietary device firmware update tool which only exists
> as an x86 binary and your hardware is not x86 - running qemu + vfio in full
> emulation would provide a way to run the tool to update a physical device.
That specific use case doesn't really need a vIOMMU though, does it?
Jason
Hi Jason,
On Tue, 4 May 2021 20:15:30 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Tue, May 04, 2021 at 03:11:54PM -0700, Jacob Pan wrote:
>
> > > It is a weird way to use xarray to have a structure which
> > > itself is just a wrapper around another RCU protected structure.
> > >
> > > Make the caller supply the ioasid_data memory, embedded in its own
> > > element, get rid of the void * and rely on XA_ZERO_ENTRY to hold
> > > allocated but not active entries.
> > >
> > Let me try to paraphrase to make sure I understand. Currently
> > struct ioasid_data is private to the iasid core, its memory is
> > allocated by the ioasid core.
> >
> > You are suggesting the following:
> > 1. make struct ioasid_data public
> > 2. caller allocates memory for ioasid_data, initialize it then pass it
> > to ioasid_alloc to store in the xarray
> > 3. caller will be responsible for setting private data inside
> > ioasid_data and do call_rcu after update if needed.
>
> Basically, but you probably won't need a "private data" once the
> caller has this struct as it can just embed it in whatever larger
> struct makes sense for it and use container_of/etc
>
that makes sense. thanks!
> I didn't look too closely at the whole thing though. Honestly I'm a
> bit puzzled why we need a pluggable global allocator framework.. The
> whole framework went to some trouble to isolate everything into iommu
> drivers then that whole design is disturbed by this global thing.
>
Global and pluggable are for slightly separate reasons.
- We need global PASID on VT-d in that we need to support shared
workqueues (SWQ). E.g. One SWQ can be wrapped into two mdevs then assigned
to two VMs. Each VM uses its private guest PASID to submit work but
each guest PASID must be translated to a global (system-wide) host PASID to
avoid conflict. Also, since PASID table storage is per PF, if two mdevs of
the same PF are assigned to different VMs, the PASIDs must be unique.
- The pluggable allocator is to support the option where the guest PASIDs
are allocated by the hypervisor. Let it be the same as the host PASID or
some arbitrary number cooked up by the hypervisor but backed by a host HW
PASID. VT-d spec has this virtual command interface that requires the guest
to use it instead of allocating from the guest ioasid xarray. This is the
reason why it has to go down to the iommu vendor driver. I guess that is what
you meant by "went to some trouble to isolate everything into iommu"?
For ARM, since the guest owns the per-device PASID table, there is no need
to allocate PASIDs from the host or the hypervisor. Without SWQ, there is
no need for global PASID/SSID either. So PASID being global for ARM is for
simplicity in case of host PASID/SSID.
> Jason
Thanks,
Jacob
On Wed, May 05, 2021 at 10:22:59AM -0700, Jacob Pan wrote:
> Global and pluggable are for slightly separate reasons.
> - We need global PASID on VT-d in that we need to support shared
> workqueues (SWQ). E.g. One SWQ can be wrapped into two mdevs then assigned
> to two VMs. Each VM uses its private guest PASID to submit work but
> each guest PASID must be translated to a global (system-wide) host PASID to
> avoid conflict. Also, since PASID table storage is per PF, if two mdevs of
> the same PF are assigned to different VMs, the PASIDs must be unique.
From a protocol perspective each RID has a unique PASID table, and
RIDs can have overlapping PASIDs.
Since your SWQ is connected to a single RID the requirement that
PASIDs are unique to the RID ensures they are sufficiently unique.
If the IOMMU driver has additional restrictions then it should raise
the PASID table up higher in the hierarchy than at the RID.
I think what you are trying to explain is that the Intel vIOMMU has a
single PASID address space shared globally by the vCPU because ENQCMD
uses the global vGPU translation table.
That is fine, but all this stuff should be inside the Intel vIOMMU
driver not made into a global resource of the entire iommu subsystem.
Systems that work this way just cannot have multiple iommu drivers
competing for PASID.
> - The pluggable allocator is to support the option where the guest PASIDs
> are allocated by the hypervisor.
And if the hypervisor allocates the PASID then again the specific
vIOMMU itself is concerned with this and it has nothing to do with
global behavior of the iommu subsystem.
> For ARM, since the guest owns the per device PASID table. There is no need
> to allocate PASIDs from the host nor the hypervisor. Without SWQ, there is
> no need for global PASID/SSID either. So PASID being global for ARM is for
> simplicity in case of host PASID/SSID.
It isn't clear how ARM can support PASID and mdev but that is an
unrelated issue..
Jason
Hi Jason,
On Wed, 5 May 2021 15:00:23 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Wed, May 05, 2021 at 10:22:59AM -0700, Jacob Pan wrote:
>
> > Global and pluggable are for slightly separate reasons.
> > - We need global PASID on VT-d in that we need to support shared
> > workqueues (SWQ). E.g. One SWQ can be wrapped into two mdevs then
> > assigned to two VMs. Each VM uses its private guest PASID to submit
> > work but each guest PASID must be translated to a global (system-wide)
> > host PASID to avoid conflict. Also, since PASID table storage is per
> > PF, if two mdevs of the same PF are assigned to different VMs, the
> > PASIDs must be unique.
>
> From a protocol perspective each RID has a unique PASID table, and
> RIDs can have overlapping PASIDs.
>
True, per RID or per PF as I was referring to.
> Since your SWQ is connected to a single RID the requirement that
> PASIDs are unique to the RID ensures they are sufficiently unique.
>
True, but one process can submit work to multiple mdevs from different
RIDs/PFs. One process uses one PASID and the PASID translation table is per VM.
The same PASID is used for all the PASID tables of each RID.
For example:
VM1 has two mdevs: mdev1 and mdev2. mdev1's parent is RID1, mdev2's parent
is RID2. The guest process A allocates PASID_A and binds it to both mdev1 and
mdev2. PASID_A must be present in the PASID tables for both RID1 and RID2.
If the allocator is per RID, it is not possible to ensure PASID_A is
available for both RIDs. Right?
Sorry I missed this point in my earlier explanation.
> If the IOMMU driver has additional restrictions then it should raise
> the PASID table up higher in the hierarchy than at the RID.
>
That higher level in the hierarchy is global, right? I am a little
concerned about expanding PASID table sharing from a security perspective,
even though VMs already share the PASID table for mdevs.
> I think what you are trying to explain is that the Intel vIOMMU has a
> single PASID address space shared globally by the vCPU because ENQCMD
> uses the global vGPU translation table.
>
Yes, the PASID translation table is per VM, global in terms of the guest.
That, combined with the case where two mdevs from different RIDs can be used by
the same guest process/PASID, requires a global PASID.
> That is fine, but all this stuff should be inside the Intel vIOMMU
> driver not made into a global resource of the entire iommu subsystem.
>
The Intel vIOMMU has to use a generic uAPI to allocate PASIDs, so the generic
code needs to have this option. I guess you are saying we should also have a
per RID allocation option in addition to global?
> Systems that work this way just cannot have multiple iommu drivers
> competing for PASID.
>
Sorry, I am not following. There would not be mixed iommu drivers on one
platform; I must have missed your point. Could you explain a little?
> > - The pluggable allocator is to support the option where the guest
> > PASIDs are allocated by the hypervisor.
>
> And if the hypervisor allocates the PASID then again the specific
> vIOMMU itself is concerned with this and it has nothing to do with
> global behavior of the iommu subsystem.
>
> > For ARM, since the guest owns the per device PASID table. There is no
> > need to allocate PASIDs from the host nor the hypervisor. Without SWQ,
> > there is no need for global PASID/SSID either. So PASID being global
> > for ARM is for simplicity in case of host PASID/SSID.
>
> It isn't clear how ARM can support PASID and mdev but that is an
> unrelated issue..
>
AFAIK, the current SMMU device assignment is per RID, since there is only one
set of stage-2 page tables per RID, not per PASID. This is equivalent to the
older VT-d spec prior to scalable mode.
Eric/Jean, can you help?
> Jason
Thanks,
Jacob
On Wed, May 05, 2021 at 01:04:46PM -0700, Jacob Pan wrote:
> Hi Jason,
>
> On Wed, 5 May 2021 15:00:23 -0300, Jason Gunthorpe <[email protected]> wrote:
>
> > On Wed, May 05, 2021 at 10:22:59AM -0700, Jacob Pan wrote:
> >
> > > Global and pluggable are for slightly separate reasons.
> > > - We need global PASID on VT-d in that we need to support shared
> > > workqueues (SWQ). E.g. One SWQ can be wrapped into two mdevs then
> > > assigned to two VMs. Each VM uses its private guest PASID to submit
> > > work but each guest PASID must be translated to a global (system-wide)
> > > host PASID to avoid conflict. Also, since PASID table storage is per
> > > PF, if two mdevs of the same PF are assigned to different VMs, the
> > > PASIDs must be unique.
> >
> > From a protocol perspective each RID has a unique PASID table, and
> > RIDs can have overlapping PASIDs.
> >
> True, per RID or per PF as I was referring to.
>
> > Since your SWQ is connected to a single RID the requirement that
> > PASIDs are unique to the RID ensures they are sufficiently unique.
> >
> True, but one process can submit work to multiple mdevs from different
> RIDs/PFs. One process uses one PASID and PASID translation table is per VM.
> The same PASID is used for all the PASID tables of each RID.
If the model is "assign this PASID to this RID" then yes, there is a
big problem keeping everything straight that can only be solved with a
global table.
But if the model is "give me a PASID for this RID" then it isn't such
a problem.
Basically trying to enforce a uniform PASID for an IOASID across all
RIDs attached to it is not such a nice choice.
> > That is fine, but all this stuff should be inside the Intel vIOMMU
> > driver not made into a global resource of the entire iommu subsystem.
> >
> Intel vIOMMU has to use a generic uAPI to allocate PASID so the generic
> code need to have this option. I guess you are saying we should also have a
> per RID allocation option in addition to global?
There always has to be RID involvement for the PASID, for security;
this issue really boils down to where the PASID lives.
If you need the PASID attached to the IOASID then it has to be global
because the IOASID can be attached to any RID and must keep the same
PASID.
If the PASID is learned when the IOASID is attached to a RID then the
PASID is more flexible and isn't attached to the IOASID.
Honestly I'm a little leery of baking into a UAPI a specific HW choice
that Intel made here.
I would advise making the "attach a global PASID to this IOASID"
operation explicit and opt-in for cases that actually need it.
Which implies the API to the iommu driver should be more like:
'assign an IOASID to this RID and return the PASID'
'reserve a PASID from every RID'
'assign an IOASID to this RID and use this specific PASID'
In all cases the scope of those operations is completely local to a
certain IOMMU driver - 'reserve a PASID from every RID' is really
every RID that driver can operate on.
So it is hard to see why the allocator should be a global resource and
not something that is part of the iommu driver exclusively.
Jason
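For illustration only, those three operations could be expressed as per-driver ops along these lines. Every name below is hypothetical; the point is just that nothing here needs a global allocator outside the driver:
#include <linux/device.h>
#include <linux/ioasid.h>

/* hypothetical per-IOMMU-driver ops, scoped entirely to that driver */
struct iommu_pasid_ops {
        /* 'assign an IOASID to this RID and return the PASID' */
        int (*attach_ioasid)(struct device *dev, ioasid_t ioasid,
                             u32 *out_pasid);

        /* 'reserve a PASID from every RID this driver operates on' */
        int (*reserve_pasid_all_rids)(u32 *out_pasid);

        /* 'assign an IOASID to this RID and use this specific PASID' */
        int (*attach_ioasid_pasid)(struct device *dev, ioasid_t ioasid,
                                   u32 pasid);
};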
On Wed, May 05, 2021 at 07:21:20PM -0300, Jason Gunthorpe wrote:
> On Wed, May 05, 2021 at 01:04:46PM -0700, Jacob Pan wrote:
> > Hi Jason,
> >
> > On Wed, 5 May 2021 15:00:23 -0300, Jason Gunthorpe <[email protected]> wrote:
> >
> > > On Wed, May 05, 2021 at 10:22:59AM -0700, Jacob Pan wrote:
> > >
> > > > Global and pluggable are for slightly separate reasons.
> > > > - We need global PASID on VT-d in that we need to support shared
> > > > workqueues (SWQ). E.g. One SWQ can be wrapped into two mdevs then
> > > > assigned to two VMs. Each VM uses its private guest PASID to submit
> > > > work but each guest PASID must be translated to a global (system-wide)
> > > > host PASID to avoid conflict. Also, since PASID table storage is per
> > > > PF, if two mdevs of the same PF are assigned to different VMs, the
> > > > PASIDs must be unique.
> > >
> > > From a protocol perspective each RID has a unique PASID table, and
> > > RIDs can have overlapping PASIDs.
> > >
> > True, per RID or per PF as I was referring to.
> >
> > > Since your SWQ is connected to a single RID the requirement that
> > > PASIDs are unique to the RID ensures they are sufficiently unique.
> > >
> > True, but one process can submit work to multiple mdevs from different
> > RIDs/PFs. One process uses one PASID and PASID translation table is per VM.
> > The same PASID is used for all the PASID tables of each RID.
>
> If the model is "assign this PASID to this RID" then yes, there is a
> big problem keeping everything straight that can only be solved with a
> global table.
>
> But if the model is "give me a PASID for this RID" then it isn't such
> a problem.
Correct, since we have usage with ENQCMD, it's more like
- Give me a PASID1 (not attached to any RID)
- Bind/attach PASID1 with RID1
- Bind/attach PASID1 with RID2
And ENQCMD isn't just for Intel; with the DMWr spec in PCIe, it extends to
all devices as long as routing is supported by intermediate switches and such.
>
> Basically trying to enforce a uniform PASID for an IOASID across all
> RIDs attached to it is not such a nice choice.
>
> > > That is fine, but all this stuff should be inside the Intel vIOMMU
> > > driver not made into a global resource of the entire iommu subsystem.
> > >
> > Intel vIOMMU has to use a generic uAPI to allocate PASID so the generic
> > code need to have this option. I guess you are saying we should also have a
> > per RID allocation option in addition to global?
>
> There always has to be a RID involvement for the PASID, for security,
> this issue really boils down to where the PASID lives.
We do always have RID involvement with PASID for security. Every RID has
its own PASID table, but the PASID namespace is global.
So if you have RID1 associated with PASID1, another RID2 doesn't have
PASID1 in its PASID table until the app binds PASID1 with RID2 as
well; then you have PASID1 plumbed into the PASID table for RID2.
Is this what you are referring to for security?
>
> If you need the PASID attached to the IOASID then it has to be global
> because the IOASID can be attached to any RID and must keep the same
> PASID.
>
> If the PASID is learned when the IOASID is attached to a RID then the
> PASID is more flexible and isn't attached to the IOASID.
>
> Honestly I'm a little leary to bake into a UAPI a specific HW choice
> that Intel made here.
Like I mentioned, this isn't just Intel going forward. The specs are public
in PCIe. I just can't comment on which other vendors are adopting it.
>
> I would advise making the "attach a global PASID to this IOASID"
> operation explicit and opt into for case that actually need it.
>
> Which implies the API to the iommu driver should be more like:
>
> 'assign an IOASID to this RID and return the PASID'
> 'reserve a PASID from every RID'
I don't think this has any decent chance of success. It's rather a roundabout
way to get a global PASID namespace.
> 'assign an IOASID to this RID and use this specific PASID'
This seems a bit complicated. Another way to specify this:
- IOASID is a logical construct to specify a page table.
- You can bind a global PASID to an IOASID
We aren't losing any security by using a global PASID namespace.
Until the application asks for it, a PASID is not bound to any other RID
without an explicit request.
--
Cheers,
Ashok
On Wed, May 05, 2021 at 01:04:46PM -0700, Jacob Pan wrote:
> > > For ARM, since the guest owns the per device PASID table. There is no
> > > need to allocate PASIDs from the host nor the hypervisor. Without SWQ,
> > > there is no need for global PASID/SSID either. So PASID being global
> > > for ARM is for simplicity in case of host PASID/SSID.
> >
> > It isn't clear how ARM can support PASID and mdev but that is an
> > unrelated issue..
> >
> AFAIK, the current SMMU device assignment is per RID, since only one stage2
> page tables per RID, not per PASID. This is equivalent to the older VT-d
> spec. prior to scalable mode.
Yes that's right. Since SMMUv3 has a single level-2 page table per RID, it
doesn't support assigning level-1 page tables to guests for mdevs (sub-VF
devices). So no PASIDs for mdevs, which also means each guest has its own
PASID space and the host doesn't track guest PASIDs.
Thanks,
Jean
On Wed, May 05, 2021 at 04:23:19PM -0700, Raj, Ashok wrote:
> > Which implies the API to the iommu driver should be more like:
> >
> > 'assign an IOASID to this RID and return the PASID'
> > 'reserve a PASID from every RID'
>
> I don't think this has any decent change of success. Its rather round about
> way to get a global PASID namespace.
>
> > 'assign an IOASID to this RID and use this specific PASID'
>
> This seems a bit complicated. Another way to specify this.
Maybe, but I don't like that the driver-based iommu API has been
corrupted by injecting a global 'first driver to claim it'
resource. It is not properly layered anymore.
Jason
On Thu, May 06, 2021 at 09:23:48AM +0200, Jean-Philippe Brucker wrote:
> On Wed, May 05, 2021 at 01:04:46PM -0700, Jacob Pan wrote:
> > > > For ARM, since the guest owns the per device PASID table. There is no
> > > > need to allocate PASIDs from the host nor the hypervisor. Without SWQ,
> > > > there is no need for global PASID/SSID either. So PASID being global
> > > > for ARM is for simplicity in case of host PASID/SSID.
> > >
> > > It isn't clear how ARM can support PASID and mdev but that is an
> > > unrelated issue..
> > >
> > AFAIK, the current SMMU device assignment is per RID, since only one stage2
> > page tables per RID, not per PASID. This is equivalent to the older VT-d
> > spec. prior to scalable mode.
>
> Yes that's right. Since SMMUv3 has a single level-2 page table per RID, it
> doesn't support assigning level-1 page tables to guests for mdevs (sub-VF
> devices). So no PASIDs for mdevs, which also means each guest has its own
> PASID space and the host doesn't track guest PASIDs.
Basically it means when the guest's top level IOASID is created for
nesting that IOASID claims all PASIDs on the RID and excludes any
PASID IOASIDs from existing on the RID now or in future.
Which would be a different behavior than something like Intel's top
level IOASID that doesn't claim all the PASIDs.
Lots of little special flags in here :|
Jason
Hi Jason
On Thu, May 06, 2021 at 09:27:30AM -0300, Jason Gunthorpe wrote:
> On Thu, May 06, 2021 at 09:23:48AM +0200, Jean-Philippe Brucker wrote:
> > On Wed, May 05, 2021 at 01:04:46PM -0700, Jacob Pan wrote:
> > > > > For ARM, since the guest owns the per device PASID table. There is no
> > > > > need to allocate PASIDs from the host nor the hypervisor. Without SWQ,
> > > > > there is no need for global PASID/SSID either. So PASID being global
> > > > > for ARM is for simplicity in case of host PASID/SSID.
> > > >
> > > > It isn't clear how ARM can support PASID and mdev but that is an
> > > > unrelated issue..
> > > >
> > > AFAIK, the current SMMU device assignment is per RID, since only one stage2
> > > page tables per RID, not per PASID. This is equivalent to the older VT-d
> > > spec. prior to scalable mode.
> >
> > Yes that's right. Since SMMUv3 has a single level-2 page table per RID, it
> > doesn't support assigning level-1 page tables to guests for mdevs (sub-VF
> > devices). So no PASIDs for mdevs, which also means each guest has its own
> > PASID space and the host doesn't track guest PASIDs.
>
> Basically it means when the guest's top level IOASID is created for
> nesting that IOASID claims all PASID's on the RID and excludes any
> PASID IOASIDs from existing on the RID now or in future.
The way to look at this is as follows:
For platforms that do not need to support the shared work queue model
for ENQCMD or similar, the PASID space is naturally per RID. There is no
complication with this. Every RID has the full range of PASIDs and no need
for the host to track which PASIDs are allocated now or in the future in the guest.
For platforms that support ENQCMD, it is required to mandate PASIDs are
global across the entire system. Maybe it's better to call them gPASID for
guest and hPASID for host. The short reason is that gPASID->hPASID is a guest-
wide mapping for ENQCMD and not a per-RID based mapping. (We covered that
in earlier responses)
In our current implementation we actually don't separate this space, and
gPASID == hPASID. The iommu driver enforces that by using the custom
allocator and the architected interface that allows all guest vIOMMU
allocations to be proxied to the host. Nothing but a glorified hypercall-like
interface. In fact some OSes do use a hypercall to get an hPASID vs using
the vCMD style interface.
Cases where the full PASID range for every RID is completely
managed by the guest, requiring no assistance from the host to ensure
uniqueness, don't need a custom allocator. Maybe the general
allocator can have ways to ensure global uniqueness vs. RID-wide
uniqueness. This is still managed by the iommu driver (vIOMMU) + the
backend for vCMD in the host IOMMU driver.
>
> Which would be a different behavior than something like Intel's top
> level IOASID that doesn't claim all the PASIDs.
Isn't this simple, if we can say the ioasid allocator can provide
- system-wide PASID
- RID-local PASID
based on platform capabilities that require such differentiation?
And based on the other threads, if the ioasid is just a pgtable representation,
it doesn't need a PASID per se. But when you want to use SVM or such, you
can associate a PASID with it for the IOMMU to plumb things with hardware.
Cheers,
Ashok
> From: Alex Williamson <[email protected]>
> Sent: Wednesday, April 28, 2021 11:06 PM
>
> On Wed, 28 Apr 2021 06:34:11 +0000
> "Tian, Kevin" <[email protected]> wrote:
>
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Monday, April 26, 2021 8:38 PM
> > >
> > [...]
> > > > Want to hear your opinion for one open here. There is no doubt that
> > > > an ioasid represents a HW page table when the table is constructed by
> > > > userspace and then linked to the IOMMU through the bind/unbind
> > > > API. But I'm not very sure about whether an ioasid should represent
> > > > the exact pgtable or the mapping metadata when the underlying
> > > > pgtable is indirectly constructed through map/unmap API. VFIO does
> > > > the latter way, which is why it allows multiple incompatible domains
> > > > in a single container which all share the same mapping metadata.
> > >
> > > I think VFIO's map/unmap is way too complex and we know it has bad
> > > performance problems.
> >
> > Can you or Alex elaborate where the complexity and performance problem
> > locate in VFIO map/umap? We'd like to understand more detail and see
> how
> > to avoid it in the new interface.
>
>
> The map/unmap interface is really only good for long lived mappings,
> the overhead is too high for things like vIOMMU use cases or any case
> where the mapping is intended to be dynamic. Userspace drivers must
> make use of a long lived buffer mapping in order to achieve performance.
This is not a limitation of VFIO map/unmap. It's the limitation of any
map/unmap semantics since the fact of long-lived vs. short-lived is
imposed by userspace. Nested translation is the only viable optimization
allowing 2nd-level to be a long-lived mapping even w/ vIOMMU. From
this angle I'm not sure how a new map/unmap implementation could
address this perf limitation alone.
>
> The mapping and unmapping granularity has been a problem as well,
> type1v1 allowed arbitrary unmaps to bisect the original mapping, with
> the massive caveat that the caller relies on the return value of the
> unmap to determine what was actually unmapped because the IOMMU use
> of
> superpages is transparent to the caller. This led to type1v2 that
> simply restricts the user to avoid ever bisecting mappings. That still
> leaves us with problems for things like virtio-mem support where we
> need to create initial mappings with a granularity that allows us to
> later remove entries, which can prevent effective use of IOMMU
> superpages.
We could start with semantics similar to type1v2.
btw why does virtio-mem require a smaller granularity? Can we split
superpages on the fly when removal actually happens (similar
to page splitting in VM live migration for efficient dirty page tracking)?
and isn't it another problem imposed by userspace? How could a new
map/unmap implementation mitigate this problem if the userspace
insists on a smaller granularity for initial mappings?
>
> Locked page accounting has been another constant issue. We perform
> locked page accounting at the container level, where each container
> accounts independently. A user may require multiple containers, the
> containers may pin the same physical memory, but be accounted against
> the user once per container.
For /dev/ioasid there is still an open question whether a process is allowed
to open /dev/ioasid once or multiple times. If there is only one ioasid_fd
per process, the accounting can be made accurate. Otherwise the
same problem still exists, as each ioasid_fd is akin to the container, and then
we need to find a better solution.
>
> Those are the main ones I can think of. It is nice to have a simple
> map/unmap interface, I'd hope that a new /dev/ioasid interface wouldn't
> raise the barrier to entry too high, but the user needs to have the
> ability to have more control of their mappings and locked page
> accounting should probably be offloaded somewhere. Thanks,
>
Based on your feedback I feel it's probably reasonable to start with
type1v2 semantics for the new interface. Locked accounting could
also start with the same VFIO restriction and then improve it
incrementally, if a cleaner way is intrusive (if not affecting uAPI).
But I didn't get the suggestion on "more control of their mappings".
Can you elaborate?
Thanks
Kevin
> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, May 5, 2021 1:13 AM
>
> On Wed, Apr 28, 2021 at 06:58:19AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Wednesday, April 28, 2021 1:12 AM
> > >
> > [...]
> > > > As Alex says, if this line fails because of the group restrictions,
> > > > that's not great because it's not very obvious what's gone wrong.
> > >
> > > Okay, that is fair, but let's solve that problem directly. For
> > > instance netlink has been going in the direction of adding a "extack"
> > > from the kernel which is a descriptive error string. If the failing
> > > ioctl returned the string:
> > >
> > > "cannot join this device to the IOASID because device XXX in the
> > > same group #10 is in use"
> > >
> > > Would you agree it is now obvious what has gone wrong? In fact would
> > > you agree this is a lot better user experience than what applications
> > > do today even though they have the group FD?
> > >
> >
> > Currently all the discussions are around implicit vs. explicit uAPI semantics
> > on the group restriction. However if we look beyond group the implicit
> > semantics might be inevitable when dealing with incompatible iommu
> > domains. An existing example of iommu incompatibility is IOMMU_
> > CACHE.
>
> I still think we need to get rid of these incompatibilities
> somehow. Having multiple HW incompatible IOASID in the same platform
> is just bad all around.
>
> When modeling in userspace IOMMU_CACHE sounds like it is a property of
> each individual IOASID, not an attribute that requires a new domain.
Sure. The iommu domain is a kernel-internal concept. Userspace
should focus everything on the IOASID.
>
> People that want to create cache bypass IOASID's should just ask for
> that that directly.
>
Yes, in an earlier discussion we agreed on a scheme where the ioasid module
returns an error to userspace indicating that an incompatibility was detected
when binding a device to an ioasid, and then the userspace should create
a new IOASID for this device. This has to be done 'explicitly'.
When I used it as the example for 'implicit semantics', my point was that the
kernel won't create another group-like object to contain devices with compatible
attributes and 'explicitly' manage it in the uAPI like group_fd. If we anyway
rely on the userspace to have more intelligence on those hardware
restrictions, there is little sense in only explicitly handling group_fd in the uAPI.
Thanks
Kevin
On Fri, May 07, 2021 at 07:36:49AM +0000, Tian, Kevin wrote:
> for /dev/ioasid there is still an open whether an process is allowed to
> open /dev/ioasid once or multiple times. If there is only one ioasid_fd
> per process, the accounting can be made accurately. otherwise the
> same problem still exists as each ioasid_fd is akin to the container, then
> we need find a better solution.
You can't really do tricks like 'FD once per process' in linux.
The locked page accounting problem is much bigger than vfio and I
don't really know of any solution..
Jason
On Fri, May 07, 2021 at 11:06:14AM -0600, Alex Williamson wrote:
> We had tossed around an idea of a super-container with vfio, it's maybe
> something we'd want to incorporate into this design. For instance, if
> memory could be pre-registered with a super container, which would
> handle the locked memory accounting for that memory, then
> sub-containers could all handle the IOMMU context of their sets of
> devices relative to that common memory pool.
This is where I suggested to David to use nesting of IOASIDs.
Without HW support for nesting a SW nest is really just re-using the
memory registration information stored in the parent when constructing
the children.
Jason
On Thu, May 06, 2021 at 09:32:40AM -0700, Raj, Ashok wrote:
> For platforms that support ENQCMD, it is required to mandate PASIDs are
> global across the entire system. Maybe its better to call them gPASID for
> guest and hPASID for host. Short reason being gPASID->hPASID is a guest
> wide mapping for ENQCMD and not a per-RID based mapping. (We covered that
> in earlier responses)
I don't think it is actually ENQCMD that forces this; ENQCMD can use a
per-RID PASID in the translation table as well.
You get forced here only based on the design of the vIOMMU
communication channel. If the guest can demand any RID is attached to
a specific guest determined PASID then the hypervisor must accommodate
that.
> > Which would be a different behavior than something like Intel's top
> > level IOASID that doesn't claim all the PASIDs.
>
> isn't this simple, if we can say ioasid allocator can provide
>
> - system wide PASID
> - RID local PASID
>
> Based on platform capabilities that require such differentiation?
I think at the uAPI level the callpaths that require allocating a
PASID from a group of RIDs should be explicit in their intention and
not implicitly rely on a certain allocator behavior.
If you want to get a PASID that can be used with every RID in your
/dev/ioasid then ask for that exactly.
It makes the uAPI much more understandable to be explicit.
Jason
On Fri, 7 May 2021 07:36:49 +0000
"Tian, Kevin" <[email protected]> wrote:
> > From: Alex Williamson <[email protected]>
> > Sent: Wednesday, April 28, 2021 11:06 PM
> >
> > On Wed, 28 Apr 2021 06:34:11 +0000
> > "Tian, Kevin" <[email protected]> wrote:
> > >
> > > Can you or Alex elaborate where the complexity and performance problem
> > > locate in VFIO map/umap? We'd like to understand more detail and see
> > how
> > > to avoid it in the new interface.
> >
> >
> > The map/unmap interface is really only good for long lived mappings,
> > the overhead is too high for things like vIOMMU use cases or any case
> > where the mapping is intended to be dynamic. Userspace drivers must
> > make use of a long lived buffer mapping in order to achieve performance.
>
> This is not a limitation of VFIO map/unmap. It's the limitation of any
> map/unmap semantics since the fact of long-lived vs. short-lived is
> imposed by userspace. Nested translation is the only viable optimization
> allowing 2nd-level to be a long-lived mapping even w/ vIOMMU. From
> this angle I'm not sure how a new map/unmap implementation could
> address this perf limitation alone.
Sure, we don't need to try to tackle every problem at once, a map/unmap
interface compatible with what we have is a good place to start and
nested translation may provide the high performance option. That's not
to say that we couldn't, in the future, extend the map/unmap with memory
pre-registration like done in the spapr IOMMU to see how that could
reduce latency.
> > The mapping and unmapping granularity has been a problem as well,
> > type1v1 allowed arbitrary unmaps to bisect the original mapping, with
> > the massive caveat that the caller relies on the return value of the
> > unmap to determine what was actually unmapped because the IOMMU use
> > of
> > superpages is transparent to the caller. This led to type1v2 that
> > simply restricts the user to avoid ever bisecting mappings. That still
> > leaves us with problems for things like virtio-mem support where we
> > need to create initial mappings with a granularity that allows us to
> > later remove entries, which can prevent effective use of IOMMU
> > superpages.
>
> We could start with a semantics similar to type1v2.
>
> btw why does virtio-mem require a smaller granularity? Can we split
> superpages in-the-fly when removal actually happens (just similar
> to page split in VM live migration for efficient dirty page tracking)?
The IOMMU API doesn't currently support those semantics. If the IOMMU
used a superpage, then a superpage gets unmapped; it doesn't get
atomically broken down into smaller pages. Therefore virtio-mem
proposes a fixed mapping granularity to allow for that same unmapping
granularity.
> and isn't it another problem imposed by userspace? How could a new
> map/unmap implementation mitigate this problem if the userspace
> insists on a smaller granularity for initial mappings?
Currently if userspace wants to guarantee unmap granularity, they need
to make the same restriction themselves on the mapping granularity.
For instance, userspace cannot currently map a 1GB IOVA range while
guaranteeing 2MB unmap granularity of that range with a single ioctl.
Instead userspace would need to make 512 separate 2MB mapping calls.
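To make the cost concrete, a sketch of what that looks like today with the real type1 ioctl (the container fd and the buffer/IOVA addresses are placeholders): one VFIO_IOMMU_MAP_DMA call per 2MB piece, so that each piece can later be unmapped individually:
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

#define CHUNK (2UL * 1024 * 1024)          /* desired unmap granularity */
#define TOTAL (1024UL * 1024 * 1024)       /* 1GB IOVA range */

static int map_1g_in_2m_chunks(int container_fd, uint64_t iova, uint64_t vaddr)
{
        struct vfio_iommu_type1_dma_map map = {
                .argsz = sizeof(map),
                .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
                .size  = CHUNK,
        };
        uint64_t off;

        /* 512 separate ioctls, so each 2MB piece can later be unmapped */
        for (off = 0; off < TOTAL; off += CHUNK) {
                map.iova  = iova + off;
                map.vaddr = vaddr + off;
                if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map))
                        return -1;
        }
        return 0;
}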
> > Locked page accounting has been another constant issue. We perform
> > locked page accounting at the container level, where each container
> > accounts independently. A user may require multiple containers, the
> > containers may pin the same physical memory, but be accounted against
> > the user once per container.
>
> for /dev/ioasid there is still an open whether an process is allowed to
> open /dev/ioasid once or multiple times. If there is only one ioasid_fd
> per process, the accounting can be made accurately. otherwise the
> same problem still exists as each ioasid_fd is akin to the container, then
> we need find a better solution.
We had tossed around an idea of a super-container with vfio, it's maybe
something we'd want to incorporate into this design. For instance, if
memory could be pre-registered with a super container, which would
handle the locked memory accounting for that memory, then
sub-containers could all handle the IOMMU context of their sets of
devices relative to that common memory pool.
> > Those are the main ones I can think of. It is nice to have a simple
> > map/unmap interface, I'd hope that a new /dev/ioasid interface wouldn't
> > raise the barrier to entry too high, but the user needs to have the
> > ability to have more control of their mappings and locked page
> > accounting should probably be offloaded somewhere. Thanks,
> >
>
> Based on your feedbacks I feel it's probably reasonable to start with
> a type1v2 semantics for the new interface. Locked accounting could
> also start with the same VFIO restriction and then improve it
> incrementally, if a cleaner way is intrusive (if not affecting uAPI).
> But I didn't get the suggestion on "more control of their mappings".
> Can you elaborate?
Things like I note above, userspace cannot currently specify mapping
granularity nor has any visibility to the granularity they get from the
IOMMU. What actually happens in the IOMMU is pretty opaque to the user
currently. Thanks,
Alex
On Fri, May 07, 2021 at 02:20:51PM -0300, Jason Gunthorpe wrote:
> On Thu, May 06, 2021 at 09:32:40AM -0700, Raj, Ashok wrote:
>
> > For platforms that support ENQCMD, it is required to mandate PASIDs are
> > global across the entire system. Maybe its better to call them gPASID for
> > guest and hPASID for host. Short reason being gPASID->hPASID is a guest
> > wide mapping for ENQCMD and not a per-RID based mapping. (We covered that
> > in earlier responses)
>
> I don't think it is actually ENQCMD that forces this, ENQCMD can use a
> per-RID PASID in the translation table as well.
When using ENQCMD the PASID that needs to be sent on the wire is picked
from an MSR set up by the kernel. This is context switched along with the
process. So each process has only 1 PASID that can go out when using
ENQCMD. ENQCMD takes one mmio address specific to the accelerator and a
source for the descriptor.
When one application connects to more than one accelerator, since the PASID
is MSR based and filled in automatically by the cpu instruction, both
accelerators are required to use the same PASID.
Did you refer to this implementation, or something else?
>
> You get forced here only based on the design of the vIOMMU
> communication channel. If the guest can demand any RID is attached to
> a specific guest determined PASID then the hypervisor must accommodate
> that.
True, but when we have a guest using vSVM and enabling vENQCMD, the
requirement is the same inside the guest.
>
> > > Which would be a different behavior than something like Intel's top
> > > level IOASID that doesn't claim all the PASIDs.
> >
> > isn't this simple, if we can say ioasid allocator can provide
> >
> > - system wide PASID
> > - RID local PASID
> >
> > Based on platform capabilities that require such differentiation?
>
> I think at the uAPI level the callpaths that require allocating a
> PASID from a group of RIDs should be explicit in their intention and
> not implicitly rely on a certain allocator behavior.
The difficult part I see is that when one application establishes a path to one
accelerator, we have no knowledge of whether it is going to connect to a second,
third or more. I don't see how this can work reasonably well. What if PASIDx is
allocated for one, but the second RID it is trying to attach to already has this
PASID allocated?
>
> If you want to get a PASID that can be used with every RID on in your
> /dev/ioasid then ask for that exactly.
Correct, but how does the guest, through the vIOMMU driver, communicate that
intent so the uAPI plumbing can do this? I mean architecturally, via IOMMU interfaces?
Cheers,
Ashok
On Fri, May 07, 2021 at 11:14:58AM -0700, Raj, Ashok wrote:
> On Fri, May 07, 2021 at 02:20:51PM -0300, Jason Gunthorpe wrote:
> > On Thu, May 06, 2021 at 09:32:40AM -0700, Raj, Ashok wrote:
> >
> > > For platforms that support ENQCMD, it is required to mandate PASIDs are
> > > global across the entire system. Maybe its better to call them gPASID for
> > > guest and hPASID for host. Short reason being gPASID->hPASID is a guest
> > > wide mapping for ENQCMD and not a per-RID based mapping. (We covered that
> > > in earlier responses)
> >
> > I don't think it is actually ENQCMD that forces this, ENQCMD can use a
> > per-RID PASID in the translation table as well.
>
> When using ENQCMD the PASID that needs to be sent on the wire is picked
> from an MSR setup by kernel. This is context switched along with the
> process. So each process has only 1 PASID that can go out when using
> ENQCMD. ENQCMD takes one mmio address specific to the acclerator and a
> source for the descriptor.
Oh. I forgot this also globally locked the PASID to a single
MSR. Sigh. That makes the whole mechanism useless for anything except
whole process SVA.
It also makes it a general kernel problem and not just related to the
vIOMMU scenario.
> > I think at the uAPI level the callpaths that require allocating a
> > PASID from a group of RIDs should be explicit in their intention and
> > not implicitly rely on a certain allocator behavior.
>
> The difficult part I see is, when one application establishes a path
> to one acclerator, we have no knowledge if its going to connect to a
> second, third or such. I don't see how this can work reasonably
> well. What if PASIDx is allocated for one, but the second RID its
> trying to attach already has this PASID allocated?
You mean like some kind of vIOMMU hot plug?
> > If you want to get a PASID that can be used with every RID on in your
> > /dev/ioasid then ask for that exactly.
>
> Correct, but how does guest through vIOMMU driver communicate that intent so uAPI
> plumbing can do this? I mean architecturally via IOMMU interfaces?
I would have to ask for a PASID that has the property it needs. You
are saying the property is even bigger than "usable on a group of
RIDs" but is actually "global for every RID and IOMMU in the system so
it can go into a MSR". Gross, but fine, ask for that explicitly when
allocating the PASID.
Jason
Hi Jason
- Removed lizefan's email due to bounces...
On Fri, May 07, 2021 at 03:20:50PM -0300, Jason Gunthorpe wrote:
> On Fri, May 07, 2021 at 11:14:58AM -0700, Raj, Ashok wrote:
> > On Fri, May 07, 2021 at 02:20:51PM -0300, Jason Gunthorpe wrote:
> > > On Thu, May 06, 2021 at 09:32:40AM -0700, Raj, Ashok wrote:
> > >
> > > > For platforms that support ENQCMD, it is required to mandate PASIDs are
> > > > global across the entire system. Maybe its better to call them gPASID for
> > > > guest and hPASID for host. Short reason being gPASID->hPASID is a guest
> > > > wide mapping for ENQCMD and not a per-RID based mapping. (We covered that
> > > > in earlier responses)
> > >
> > > I don't think it is actually ENQCMD that forces this, ENQCMD can use a
> > > per-RID PASID in the translation table as well.
> >
> > When using ENQCMD the PASID that needs to be sent on the wire is picked
> > from an MSR setup by kernel. This is context switched along with the
> > process. So each process has only 1 PASID that can go out when using
> > ENQCMD. ENQCMD takes one mmio address specific to the acclerator and a
> > source for the descriptor.
>
> Oh. I forgot this also globally locked the PASID to a single
> MSR. Sigh. That makes the whole mechanism useless for anything except
> whole process SVA.
Is there another kind of SVA? Our model was that each process requires a
single mm, and the PASID for SVM was a direct map from that.
>
> It also make it a general kernel problem and not just related to the
> vIOMMU scenario.
>
> > > I think at the uAPI level the callpaths that require allocating a
> > > PASID from a group of RIDs should be explicit in their intention and
> > > not implicitly rely on a certain allocator behavior.
> >
> > The difficult part I see is, when one application establishes a path
> > to one acclerator, we have no knowledge if its going to connect to a
> > second, third or such. I don't see how this can work reasonably
> > well. What if PASIDx is allocated for one, but the second RID its
> > trying to attach already has this PASID allocated?
>
> You mean like some kind of vIOMMU hot plug?
Not vIOMMU hot plug, but an application opens accel1 and does a bind to
allocate a PASID. What I meant was that the kernel has no information on
whether this needs to be a per-RID PASID or a global PASID. Keeping this
global avoids the need for more complex mechanisms to say "reserve this
PASID on all accelerators", which seems pretty complicated to implement.
Now, are we losing anything by keeping the PASIDs global?
As we discussed, there is no security issue since the PASID table that hosts
these PASIDs for SVM is still per-RID. For example:
the app establishes a connection to accel1 and allocates PASID-X
the RID for accel1 now has PASID-X and the process mm plumbed in
later the app also connects to accel2, and now PASID-X is plumbed in for the RID
of accel2.
>
> > > If you want to get a PASID that can be used with every RID on in your
> > > /dev/ioasid then ask for that exactly.
> >
> > Correct, but how does guest through vIOMMU driver communicate that intent so uAPI
> > plumbing can do this? I mean architecturally via IOMMU interfaces?
>
> I would have to ask for a PASID that has the property it needs. You
> are saying the property is even bigger than "usable on a group of
> RIDs" but is actually "global for every RID and IOMMU in the system so
> it can go into a MSR". Gross, but fine, ask for that explicitly when
> allocating the PASID.
If one process has a single mm, is that also gross? :-) A single process
having a single PASID is just an identifier for the IOMMU. It just seems like
what an mm is for a process == what a PASID is for SVM-IOMMU support.
The unanswered question is how do we plumb from vIOMMU without a custom
allocator to get a system wide PASID?
The way it works today is that if we have a custom allocator registered, that's
the mechanism by which PASIDs get allocated. For the Intel vIOMMU it happens to
be a globally unique allocation. If a particular vIOMMU doesn't require that, it
does not expose a vIOMMU allocation interface, and those guests naturally get a
guest-local PASID namespace. (I'm not sure if that's how the allocator works
today, but I guess it's extensible to accomplish a RID-local PASID if that's
exactly what is required.)
Cheers,
Ashok
On Fri, May 07, 2021 at 12:23:25PM -0700, Raj, Ashok wrote:
> Hi Jason
>
> - Removed lizefan's email due to bounces...
>
> On Fri, May 07, 2021 at 03:20:50PM -0300, Jason Gunthorpe wrote:
> > On Fri, May 07, 2021 at 11:14:58AM -0700, Raj, Ashok wrote:
> > > On Fri, May 07, 2021 at 02:20:51PM -0300, Jason Gunthorpe wrote:
> > > > On Thu, May 06, 2021 at 09:32:40AM -0700, Raj, Ashok wrote:
> > > >
> > > > > For platforms that support ENQCMD, it is required to mandate PASIDs are
> > > > > global across the entire system. Maybe its better to call them gPASID for
> > > > > guest and hPASID for host. Short reason being gPASID->hPASID is a guest
> > > > > wide mapping for ENQCMD and not a per-RID based mapping. (We covered that
> > > > > in earlier responses)
> > > >
> > > > I don't think it is actually ENQCMD that forces this, ENQCMD can use a
> > > > per-RID PASID in the translation table as well.
> > >
> > > When using ENQCMD the PASID that needs to be sent on the wire is picked
> > > from an MSR setup by kernel. This is context switched along with the
> > > process. So each process has only 1 PASID that can go out when using
> > > ENQCMD. ENQCMD takes one mmio address specific to the acclerator and a
> > > source for the descriptor.
> >
> > Oh. I forgot this also globally locked the PASID to a single
> > MSR. Sigh. That makes the whole mechanism useless for anything except
> > whole process SVA.
>
> Is there another kind of SVA? Our mapping from that each process requires a
> single mm, and PASID for SVM was a direct map from that.
There are lots of potential applications for something like ENQCMD
that are not whole process SVA. Linking it to a single PASID basically
nukes any other use of it unfortunately.
> > I would have to ask for a PASID that has the property it needs. You
> > are saying the property is even bigger than "usable on a group of
> > RIDs" but is actually "global for every RID and IOMMU in the system so
> > it can go into a MSR". Gross, but fine, ask for that explicitly when
> > allocating the PASID.
>
> If one process has a single mm, is that also gross? :-) So a single process
> having a PASID is just an identifier for IOMMU. It just seems like what a
> mm is for a process == PASID for SVM-IOMMU support.
>
> The unanswered question is how do we plumb from vIOMMU without a custom
> allocator to get a system wide PASID?
PASID allocation is part of the iommu driver; it really shouldn't be
global.
When the architecture code goes to allocate a single PASID for the
mm_struct it should flag that allocation request with a 'must work for
all RIDs' flag and the iommu driver should take care of it. That might
mean the iommu driver consults a global static xarray, or maybe it
does a hypercall, but it should be done through that API, not a
sidecar global singleton.
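A rough sketch of what that flagged allocation could look like (the flag
and function names are invented for illustration, not an existing kernel
API):

/* Sketch only; names invented for illustration */
#define IOMMU_PASID_FLAG_ALL_RIDS       (1 << 0)  /* must be usable on every RID */

/*
 * Architecture code allocating the single PASID for an mm_struct passes
 * the flag; the iommu driver decides whether that means consulting a
 * shared xarray, issuing a hypercall, or failing on hardware that
 * cannot provide a PASID with that property.
 */
ioasid_t pasid = iommu_alloc_pasid_for_mm(mm, IOMMU_PASID_FLAG_ALL_RIDS);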
Jason
Hi Jason,
On Fri, 7 May 2021 16:28:10 -0300, Jason Gunthorpe <[email protected]> wrote:
> > The unanswered question is how do we plumb from vIOMMU without a custom
> > allocator to get a system wide PASID?
>
> PASID allocation is part of the iommu driver, it really shouldn't be
> global.
>
In the current code, the pluggable custom allocator *is* part of the iommu
vendor driver. If it decides the allocation is global then it should be
suitable for the platform since there will never be a VT-d IOMMU on another
vendor's platform.
It is true that the default allocator is global, which suits the current
needs. I am just wondering if we are solving a problem that does not exist yet.
> When the architecture code goes to allocate a single PASID for the
> mm_struct it should flag that allocation request with a 'must work for
> all RIDs flag' and the iommu driver should take care of it. That might
> mean the iommu driver consults a global static xarray, or maybe it
> does a hypercall, but it should be done through that API, not a side
> care global singleton.
Why do we need to flag the allocation every time if on a platform *every*
PASID can potentially be global? At the time of allocation, we don't know
if the PASID will be used for a shared (ENQCMD) or a dedicated workqueue.
Thanks,
Jacob
> From: Jason Gunthorpe <[email protected]>
> Sent: Thursday, April 29, 2021 4:46 AM
>
> > > I think the name IOASID is fine for the uAPI, the kernel version can
> > > be called ioasid_id or something.
> >
> > ioasid is already an id and then ioasid_id just adds confusion. Another
> > point is that ioasid is currently used to represent both PCI PASID and
> > ARM substream ID in the kernel. It implies that if we want to separate
> > ioasid and pasid in the uAPI the 'pasid' also needs to be replaced with
> > another general term usable for substream ID. Are we making the
> > terms too confusing here?
>
> This is why I also am not so sure about exposing the PASID in the API
> because it is ultimately a HW specific item.
>
> As I said to David, one avenue is to have some generic uAPI that is
> very general and keep all this deeply detailed stuff, that really only
> matters for qemu, as part of a more HW specific vIOMMU driver
> interface.
>
OK, so the general uAPI will not expose hw_id and will just provide everything
generic for managing I/O page tables (map/unmap, nesting, etc.) through
IOASID, and then a specific uAPI is provided to handle platform-specific
requirements (hw_id, iova windows, etc.)
Thanks
Kevin
> From: Jason Gunthorpe
> Sent: Saturday, May 8, 2021 1:11 AM
>
> On Fri, May 07, 2021 at 11:06:14AM -0600, Alex Williamson wrote:
>
> > We had tossed around an idea of a super-container with vfio, it's maybe
> > something we'd want to incorporate into this design. For instance, if
> > memory could be pre-registered with a super container, which would
> > handle the locked memory accounting for that memory, then
> > sub-containers could all handle the IOMMU context of their sets of
> > devices relative to that common memory pool.
>
> This is where I suggested to David to use nesting of IOASIDs.
>
> Without HW support for nesting a SW nest is really just re-using the
> memory registration information stored in the parent when constructing
> the children
>
Yes, this sounds like a sensible thing to do. It also unifies the user experience
regardless of whether the underlying hw supports nesting, e.g. when a
vIOMMU is present Qemu can always use the IOASID nesting uAPI. In the case
of SW nesting the kernel will merge the two-level translations from two
IOASIDs into a one-level shadow page table (unlike today's VFIO which
requires userspace to manage the shadow-based mapping).
But I want to remark that nesting IOASIDs alone cannot solve this accounting
problem completely, as long as a process is allowed to have multiple ioasid
FDs (unless there is a mechanism to allow nesting IOASIDs across FDs). But
this is probably not a big issue. With all the intended usages around the
new interface, I think for most applications one ioasid FD should be sufficient
to meet their requirements (multiple gpa_ioasids, ioasid nesting, etc.).
Thanks
Kevin
Hi Jason,
On Wed, 5 May 2021 19:21:20 -0300, Jason Gunthorpe wrote:
> On Wed, May 05, 2021 at 01:04:46PM -0700, Jacob Pan wrote:
> > Hi Jason,
> >
> > On Wed, 5 May 2021 15:00:23 -0300, Jason Gunthorpe <[email protected]> wrote:
> >
> > > On Wed, May 05, 2021 at 10:22:59AM -0700, Jacob Pan wrote:
> > >
> > > > Global and pluggable are for slightly separate reasons.
> > > > - We need global PASID on VT-d in that we need to support shared
> > > > workqueues (SWQ). E.g. One SWQ can be wrapped into two mdevs then
> > > > assigned to two VMs. Each VM uses its private guest PASID to submit
> > > > work but each guest PASID must be translated to a global (system-wide)
> > > > host PASID to avoid conflict. Also, since PASID table storage is per
> > > > PF, if two mdevs of the same PF are assigned to different VMs, the
> > > > PASIDs must be unique.
> > >
> > > From a protocol perspective each RID has a unique PASID table, and
> > > RIDs can have overlapping PASIDs.
> > >
> > True, per RID or per PF as I was referring to.
> >
> > > Since your SWQ is connected to a single RID the requirement that
> > > PASIDs are unique to the RID ensures they are sufficiently unique.
> > >
> > True, but one process can submit work to multiple mdevs from different
> > RIDs/PFs. One process uses one PASID and PASID translation table is per VM.
> > The same PASID is used for all the PASID tables of each RID.
>
> If the model is "assign this PASID to this RID" then yes, there is a
> big problem keeping everything straight that can only be solved with a
> global table.
>
> But if the model is "give me a PASID for this RID" then it isn't such
> a problem.
Let me double-check whether I'm understanding you correctly. Your suggestion
is to have a per-RID PASID namespace, which can be maintained by the IOMMU driver,
right? Take native SVM usage as an example: every time a process is bound to
a device, a PASID within this RID will be allocated. Am I correct so far?
If yes, then there is a case in which IOTLB efficiency is really low. Let's say
there is a process bound to multiple devices (RIDs) with different PASIDs
allocated for each RID. In such a case, the PASID values are different for each
RID. As most vendors do, the PASID will be used to tag IOTLB entries. So in such
a case, there will be multiple IOTLB entries for a single VA->PA mapping, and the
number of such duplicate IOTLB entries increases linearly with the number of
devices. That seems not good from a performance perspective.
>
> Basically trying to enforce a uniform PASID for an IOASID across all
> RIDs attached to it is not such a nice choice.
>
> > > That is fine, but all this stuff should be inside the Intel vIOMMU
> > > driver not made into a global resource of the entire iommu subsystem.
> > >
> > Intel vIOMMU has to use a generic uAPI to allocate PASID so the generic
> > code need to have this option. I guess you are saying we should also have a
> > per RID allocation option in addition to global?
>
> There always has to be a RID involvement for the PASID, for security,
> this issue really boils down to where the PASID lives.
>
> If you need the PASID attached to the IOASID then it has to be global
> because the IOASID can be attached to any RID and must keep the same
> PASID.
>
> If the PASID is learned when the IOASID is attached to a RID then the
> PASID is more flexible and isn't attached to the IOASID.
>
> Honestly I'm a little leary to bake into a UAPI a specific HW choice
> that Intel made here.
>
> I would advise making the "attach a global PASID to this IOASID"
> operation explicit and opt into for case that actually need it.
>
> Which implies the API to the iommu driver should be more like:
>
> 'assign an IOASID to this RID and return the PASID'
> 'reserve a PASID from every RID'
> 'assign an IOASID to this RID and use this specific PASID'
>
> In all cases the scope of those operations are completely local to a
> certain IOMMU driver - 'reserver a PASID from every RID' is really
> every RID that driver can operate on.
Also, this reservation will fail if the PASID happens to be occupied
by a previous usage. As the PASID translation table is per-VM, ENQCMD in a VM
will be a problem under such a PASID management model.
>
> So it is hard to see why the allocator should be a global resource and
> not something that is part of the iommu driver exclusively.
>
> Jason
--
Regards,
Yi Liu
> From: Alex Williamson <[email protected]>
> Sent: Saturday, May 8, 2021 1:06 AM
>
> > > Those are the main ones I can think of. It is nice to have a simple
> > > map/unmap interface, I'd hope that a new /dev/ioasid interface wouldn't
> > > raise the barrier to entry too high, but the user needs to have the
> > > ability to have more control of their mappings and locked page
> > > accounting should probably be offloaded somewhere. Thanks,
> > >
> >
> > Based on your feedbacks I feel it's probably reasonable to start with
> > a type1v2 semantics for the new interface. Locked accounting could
> > also start with the same VFIO restriction and then improve it
> > incrementally, if a cleaner way is intrusive (if not affecting uAPI).
> > But I didn't get the suggestion on "more control of their mappings".
> > Can you elaborate?
>
> Things like I note above, userspace cannot currently specify mapping
> granularity nor has any visibility to the granularity they get from the
> IOMMU. What actually happens in the IOMMU is pretty opaque to the user
> currently. Thanks,
>
It's much clearer. Based on all the discussions so far I'm thinking about
a staging approach when building the new interface, basically following
the model that Jason pointed out - generic stuff first, then platform
specific extension:
Phase 1: /dev/ioasid with core ingredients and vfio type1v2 semantics
- ioasid is the software handle representing an I/O page table
- uAPI accepts a type1v2 map/unmap semantics per ioasid
- helpers for VFIO/VDPA to bind ioasid_fd and attach ioasids
- multiple ioasids are allowed without nesting (vIOMMU, or devices
w/ incompatible iommu attributes)
- an ioasid disallows any operation before it's attached to a device
- an ioasid inherits iommu attributes from the 1st device attached
to it
- userspace is expected to manage hardware restrictions and the
kernel only returns error when restrictions are broken
* map/unmap on an ioasid will fail before every device in a group
is attached to it
* ioasid attach will fail if the new device has an incompatible iommu
attribute compared to that of this ioasid
- thus no group semantics in uAPI
- no change to vfio container/group/type1 logic, for running existing
vfio applications
* imply some duplication between vfio type1 and ioasid for some time
- new uAPI in vfio to allow explicit opening of a device and then binding
it to the ioasid_fd
* possibly require each device exposed in /dev/vfio/
- support both pdev and mdev
Phase 2: ioasid nesting
- Allow bind/unbind_pgtable semantics per ioasid
- Allow ioasid nesting
* HW ioasid nesting if supported by platform
* otherwise fall back to SW ioasid nesting (in-kernel shadowing)
- iotlb invalidation per ioasid
- I/O page fault handling per ioasid
- hw_id is not exposed in uAPI. Vendor IOMMU driver decides
when/how hw_id is allocated and programmed properly
Phase3: optimizations and vendor extensions (order undefined, up to
the specific feature owner):
- (Intel) ENQCMD support with hw_id exposure in uAPI
- (ARM/AMD) RID-based pasid table assignment
- (PPC) window-based iova management
- Optimizations:
* replace vfio type1 with a shim driver to use ioasid backend
* mapping granularity
* HW dirty page tracking
* ...
Does the above sound like a sensible plan? If yes we'll start working on
phase 1 then...
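As a quick concreteness check on phase 1, a hypothetical userspace flow
could look like the sketch below (all ioctl names are placeholders for a
uAPI still to be defined):

/* Hypothetical phase-1 flow; ioctl names are placeholders only */
ioasid_fd = open("/dev/ioasid", O_RDWR);

// one ioasid == one I/O page table, with type1v2 map/unmap semantics
gpa_ioasid = ioctl(ioasid_fd, IOASID_ALLOC);

// explicitly open the device, bind it to the ioasid_fd, then attach
device_fd = open("/dev/vfio/devices/xxx", O_RDWR);
ioctl(device_fd, VFIO_BIND_IOASID_FD, ioasid_fd);
ioctl(device_fd, VFIO_ATTACH_IOASID, gpa_ioasid);

// map/unmap are only accepted once every device in the group is attached
ioctl(ioasid_fd, IOASID_MAP_DMA, gpa_ioasid, iova, vaddr, size);
ioctl(ioasid_fd, IOASID_UNMAP_DMA, gpa_ioasid, iova, size);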
Thanks
Kevin
> From: Raj, Ashok <[email protected]>
> Sent: Friday, May 7, 2021 12:33 AM
>
> > Basically it means when the guest's top level IOASID is created for
> > nesting that IOASID claims all PASID's on the RID and excludes any
> > PASID IOASIDs from existing on the RID now or in future.
>
> The way to look at it this is as follows:
>
> For platforms that do not have a need to support shared work queue model
> support for ENQCMD or similar, PASID space is naturally per RID. There is no
> complication with this. Every RID has the full range of PASID's and no need
> for host to track which PASIDs are allocated now or in future in the guest.
>
> For platforms that support ENQCMD, it is required to mandate PASIDs are
> global across the entire system. Maybe its better to call them gPASID for
> guest and hPASID for host. Short reason being gPASID->hPASID is a guest
> wide mapping for ENQCMD and not a per-RID based mapping. (We covered
> that
> in earlier responses)
>
> In our current implementation we actually don't separate this space, and
> gPASID == hPASID. The iommu driver enforces that by using the custom
> allocator and the architected interface that allows all guest vIOMMU
> allocations to be proxied to host. Nothing but a glorified hypercall like
> interface. In fact some OS's do use hypercall to get a hPASID vs using
> the vCMD style interface.
>
After more thinking about the new interface, I feel gPASID==hPASID
actually causes some confusion in the uAPI design. In concept an ioasid
is not active until it's attached to a device, because it's just an ID
without a device. So supposedly an ioasid should reject all user commands
before attach. However a guest likely asks for a new gPASID before
attaching it to devices and the vIOMMU. If gPASID==hPASID then Qemu
must request /dev/ioasid to allocate a hw_id for an ioasid which hasn't
been attached to any device, with the assumption on kernel knowledge
that this hw_id is from a global allocator w/o dependency on any
device. This doesn't sound like a clean design, not to mention it also
conflicts with live migration.
I want to hear your and Jason's opinions about an alternative option to
remove such a restriction, thus allowing gPASID!=hPASID.
gPASID!=hPASID has a problem when assigning a physical device which
supports both shared work queue (ENQCMD with PASID in MSR)
and dedicated work queue (PASID in device register) to a guest
process which is associated to a gPASID. Say the host kernel has setup
the hPASID entry with nested translation though /dev/ioasid. For
shared work queue the CPU is configured to translate gPASID in MSR
into **hPASID** before the payload goes out to the wire. However
for dedicated work queue the device MMIO register is directly mapped
to and programmed by the guest, thus containing a **gPASID** value
implying DMA requests through this interface will hit IOMMU faults
due to invalid gPASID entry. Having gPASID==hPASID is a simple
workaround here. mdev doesn't have this problem because the
PASID register is in emulated control-path thus can be translated
to hPASID manually by mdev driver.
Along this line one possible option is having both the gPASID and hPASID
entries point to the same paging structure, sort of making gPASID
an aliasing hw_id to hPASID. Then we also need to make sure the gPASID
range does not collide with the hPASID range for this RID. Something like
below:
In the beginning Qemu specifies a minimal ID (say 1024) that hPASIDs
must be allocated beyond (sort of delegating [0, 1023] of this RID to
userspace):
ioctl(ioasid_fd, SET_IOASID_MIN_HWID, 1024);
The guest still uses the vIOMMU interface or a hypercall to allocate gPASIDs.
Upon such a request, Qemu returns a gPASID from [0, 1023] to the guest
and also allocates a new ioasid from /dev/ioasid. There is no hw_id
allocated at this step:
ioasid = ioctl(ioasid_fd, ALLOC_IOASID);
hw_id (hPASID) is allocated when attaching ioasid to the said device:
ioctl(device_fd, VFIO_ATTACH_IOASID, ioasid);
Then gPASID is provided as an aliasing hwid to this ioasid:
ioctl(device_fd, VFIO_ALIASING_IOASID, ioasid, gPASID);
Starting from this point the kernel should make sure that any ioasid
operation should be applied to both gPASID and hPASID for this RID
(entry setup, tear down, etc.) and both PASID entries point to the same
paging structures. When a page fault happens, the IOMMU driver
should also link a fault from either PASID back to the associated ioasid.
As explained earlier this aliasing requirement only applies to physical
devices on the Intel platform. We may either have mdev ignore such an
aliasing request, or have the vfio device report whether aliasing is allowed.
This is sort of a hybrid model. The gPASID range is reserved locally
in the per-RID pasid space and delegated to userspace, while the hPASIDs
are still managed globally and not exposed to userspace.
Does it sound like a cleaner approach (still w/ some complexity) compared
to the restrictions of having gPASID==hPASID?
Thanks
Kevin
On 5/8/21 3:31 PM, Tian, Kevin wrote:
>> From: Alex Williamson<[email protected]>
>> Sent: Saturday, May 8, 2021 1:06 AM
>>
>>>> Those are the main ones I can think of. It is nice to have a simple
>>>> map/unmap interface, I'd hope that a new /dev/ioasid interface wouldn't
>>>> raise the barrier to entry too high, but the user needs to have the
>>>> ability to have more control of their mappings and locked page
>>>> accounting should probably be offloaded somewhere. Thanks,
>>>>
>>> Based on your feedbacks I feel it's probably reasonable to start with
>>> a type1v2 semantics for the new interface. Locked accounting could
>>> also start with the same VFIO restriction and then improve it
>>> incrementally, if a cleaner way is intrusive (if not affecting uAPI).
>>> But I didn't get the suggestion on "more control of their mappings".
>>> Can you elaborate?
>> Things like I note above, userspace cannot currently specify mapping
>> granularity nor has any visibility to the granularity they get from the
>> IOMMU. What actually happens in the IOMMU is pretty opaque to the user
>> currently. Thanks,
>>
> It's much clearer. Based on all the discussions so far I'm thinking about
> a staging approach when building the new interface, basically following
> the model that Jason pointed out - generic stuff first, then platform
> specific extension:
>
> Phase 1: /dev/ioasid with core ingredients and vfio type1v2 semantics
> - ioasid is the software handle representing an I/O page table
A trivial proposal, is it possible to use /dev/ioas? Conceptually, it's
an IO address space representation and has nothing to do with any ID.
Best regards,
baolu
On Sat, May 08, 2021 at 09:56:59AM +0000, Tian, Kevin wrote:
> > From: Raj, Ashok <[email protected]>
> > Sent: Friday, May 7, 2021 12:33 AM
> >
> > > Basically it means when the guest's top level IOASID is created for
> > > nesting that IOASID claims all PASID's on the RID and excludes any
> > > PASID IOASIDs from existing on the RID now or in future.
> >
> > The way to look at it this is as follows:
> >
> > For platforms that do not have a need to support shared work queue model
> > support for ENQCMD or similar, PASID space is naturally per RID. There is no
> > complication with this. Every RID has the full range of PASID's and no need
> > for host to track which PASIDs are allocated now or in future in the guest.
> >
> > For platforms that support ENQCMD, it is required to mandate PASIDs are
> > global across the entire system. Maybe its better to call them gPASID for
> > guest and hPASID for host. Short reason being gPASID->hPASID is a guest
> > wide mapping for ENQCMD and not a per-RID based mapping. (We covered
> > that
> > in earlier responses)
> >
> > In our current implementation we actually don't separate this space, and
> > gPASID == hPASID. The iommu driver enforces that by using the custom
> > allocator and the architected interface that allows all guest vIOMMU
> > allocations to be proxied to host. Nothing but a glorified hypercall like
> > interface. In fact some OS's do use hypercall to get a hPASID vs using
> > the vCMD style interface.
> >
>
> After more thinking about the new interface, I feel gPASID==hPASID
> actually causes some confusion in uAPI design. In concept an ioasid
> is not active until it's attached to a device, because it's just an ID
> if w/o a device. So supposedly an ioasid should reject all user commands
> before attach. However an guest likely asks for a new gPASID before
> attaching it to devices and vIOMMU. if gPASID==hPASID then Qemu
> must request /dev/ioasid to allocate a hw_id for an ioasid which hasn't
> been attached to any device, with the assumption on kernel knowledge
> that this hw_id is from an global allocator w/o dependency on any
> device. This doesn't sound a clean design, not to say it also conflicts
> with live migration.
Everything must be explicit. The situation David pointed to of
qemu emulating a vIOMMU while running on a host with a different
platform/physical IOMMU must be considered.
If the vIOMMU needs specific behavior it must use /dev/iommu to ask
for it specifically and not just make wild assumptions about how the
platform works.
> gPASID!=hPASID has a problem when assigning a physical device which
> supports both shared work queue (ENQCMD with PASID in MSR)
> and dedicated work queue (PASID in device register) to a guest
> process which is associated to a gPASID. Say the host kernel has setup
> the hPASID entry with nested translation though /dev/ioasid. For
> shared work queue the CPU is configured to translate gPASID in MSR
> into **hPASID** before the payload goes out to the wire. However
> for dedicated work queue the device MMIO register is directly mapped
> to and programmed by the guest, thus containing a **gPASID** value
> implying DMA requests through this interface will hit IOMMU faults
> due to invalid gPASID entry. Having gPASID==hPASID is a simple
> workaround here. mdev doesn't have this problem because the
> PASID register is in emulated control-path thus can be translated
> to hPASID manually by mdev driver.
This all must be explicit too.
If a PASID is allocated and it is going to be used with ENQCMD then
everything needs to know it is actually quite different than a PASID
that was allocated to be used with a normal SRIOV device, for
instance.
The former case can accept that the guest PASID is virtualized, while
the latter cannot.
This is also why PASID per RID has to be an option. When I assign a
full SRIOV function to the guest then that entire RID space needs to
also be assigned to the guest. Upon migration I need to take all the
physical PASIDs and rebuild them in another hypervisor exactly as is.
If you force all RIDs into a global PASID pool then normal SRIOV
migration w/PASID becomes impossible. ie ENQCMD breaks everything else
that should work.
This is why you need to sort all this out and why it feels like some
of the specs here have been mis-designed.
I'm not sure carving out ranges is really workable for migration.
I think the real answer is to carve out entire RIDs as being in the
global pool or not. Then the ENQCMD HW can be bundled together and
everything else can live in the natural PASID per RID world.
Jason
On Mon, May 10, 2021 at 09:37:29AM -0300, Jason Gunthorpe wrote:
> On Sat, May 08, 2021 at 09:56:59AM +0000, Tian, Kevin wrote:
> > > From: Raj, Ashok <[email protected]>
> > > Sent: Friday, May 7, 2021 12:33 AM
> > >
> > > > Basically it means when the guest's top level IOASID is created for
> > > > nesting that IOASID claims all PASID's on the RID and excludes any
> > > > PASID IOASIDs from existing on the RID now or in future.
> > >
> > > The way to look at it this is as follows:
> > >
> > > For platforms that do not have a need to support shared work queue model
> > > support for ENQCMD or similar, PASID space is naturally per RID. There is no
> > > complication with this. Every RID has the full range of PASID's and no need
> > > for host to track which PASIDs are allocated now or in future in the guest.
> > >
> > > For platforms that support ENQCMD, it is required to mandate PASIDs are
> > > global across the entire system. Maybe its better to call them gPASID for
> > > guest and hPASID for host. Short reason being gPASID->hPASID is a guest
> > > wide mapping for ENQCMD and not a per-RID based mapping. (We covered
> > > that
> > > in earlier responses)
> > >
> > > In our current implementation we actually don't separate this space, and
> > > gPASID == hPASID. The iommu driver enforces that by using the custom
> > > allocator and the architected interface that allows all guest vIOMMU
> > > allocations to be proxied to host. Nothing but a glorified hypercall like
> > > interface. In fact some OS's do use hypercall to get a hPASID vs using
> > > the vCMD style interface.
> > >
> >
> > After more thinking about the new interface, I feel gPASID==hPASID
> > actually causes some confusion in uAPI design. In concept an ioasid
> > is not active until it's attached to a device, because it's just an ID
> > if w/o a device. So supposedly an ioasid should reject all user commands
> > before attach. However an guest likely asks for a new gPASID before
> > attaching it to devices and vIOMMU. if gPASID==hPASID then Qemu
> > must request /dev/ioasid to allocate a hw_id for an ioasid which hasn't
> > been attached to any device, with the assumption on kernel knowledge
> > that this hw_id is from an global allocator w/o dependency on any
> > device. This doesn't sound a clean design, not to say it also conflicts
> > with live migration.
>
> Everything must be explicit. The situation David pointed to of
> qemu emulating a vIOMMU while running on a host with a different
> platform/physical IOMMU must be considered.
>
> If the vIOMMU needs specific behavior it must use /dev/iommu to ask
> for it specifically and not just make wild assumptions about how the
> platform works.
I think the right way is for the pIOMMU to enforce the right behavior. The vIOMMU
can ask for a PASID and the physical IOMMU driver would give what is optimal
for the platform. If the vIOMMU says "give me a per-device PASID" but that can
lead to conflicts in the PASID namespace, it's best to avoid it.
A global PASID doesn't break anything, but giving that control to the vIOMMU
doesn't seem right. When we have mixed use cases like hardware that
supports shared wq and SRIOV devices that need PASIDs, we need to
comprehend how they will work without having a backend to migrate PASIDs
to a new destination.
For ENQCMD we have the gPASID->hPASID translation in the VMCS control.
For devices that support SIOV, programming a PASID to a device is also
mediated, so it's possible for something like the mediated interface to
assist with that migration for the dedicated WQ.
When we have both SRIOV and shared WQ exposed to the same guest, we
do have an issue. The simplest way that I thought of was to have a guest
and host PASID separation, where the guest has its own PASID space
and the host has its own carved out. The guest can do whatever it wants within
that allocated space without fear of any collision with any other device.
Cheers,
Ashok
On Mon, May 10, 2021 at 08:25:02AM -0700, Raj, Ashok wrote:
> Global PASID doesn't break anything, but giving that control to vIOMMU
> doesn't seem right. When we have mixed uses cases like hardware that
> supports shared wq and SRIOV devices that need PASIDs we need to
> comprehend how they will work without having a backend to migrate PASIDs
> to new destination.
Why wouldn't there be a backend? SRIOV live migration is a real thing
now (see Max's VFIO patches). The PASID space of the entire dedicated
RID needs to be migratable, which means the destination vIOMMU must be
able to program its local hardware with the same PASID numbers and any
kind of global PASID scheme at all will interfere with it.
> When we have both SRIOV and shared WQ exposed to the same guest, we
> do have an issue. The simplest way that I thought was to have a guest
> and host PASID separation. Where the guest has its own PASID space
> and host has its own carved out. Guest can do what ever it wants within
> that allocated space without fear of any collition with any other device.
And how do you reliably migrate if the target kernel has a PASID
already allocated in that range?
ENQCMD must not assume it is the only thing in the platform. It needs
to be compartmentalized to specific participating RIDs and made
explicit because it has a bad special requirement for cross-device
PASIDs
Jason
On Mon, May 10, 2021 at 12:31:11PM -0300, Jason Gunthorpe wrote:
> On Mon, May 10, 2021 at 08:25:02AM -0700, Raj, Ashok wrote:
>
> > Global PASID doesn't break anything, but giving that control to vIOMMU
> > doesn't seem right. When we have mixed uses cases like hardware that
> > supports shared wq and SRIOV devices that need PASIDs we need to
> > comprehend how they will work without having a backend to migrate PASIDs
> > to new destination.
>
> Why wouldn't there be a backend? SRIOV live migration is a real thing
> now (see Max's VFIO patches). The PASID space of the entire dedicated
> RID needs to be migratable, which means the destination vIOMMU must be
> able to program its local hardware with the same PASID numbers and any
> kind of global PASID scheme at all will interfere with it.
The way I'm imagining it works is as follows. We have two types of platforms.
Let me know if I missed something.
- no shared wq, meaning RID local PASID allocation is all that's needed.
Say platform type p1
- Shared WQ configurations that require global PASIDs.
Say platform type p2
The vIOMMU might have a capability bit to indicate whether it provides a PASID
allocation interface. If there is none, the guest is free to obtain its own
PASIDs per RID since they are fully isolated. For p1-type platforms this should
work, since no Qemu interface or /dev/iommu is even required in that case; the
guest kernel can manage it fully per RID.
Platform type p2 has both SIOV (with enqcmd) and SRIOV that requires
PASIDs. My thought was to reserve, say, some number of PASIDs for per-RID
use. When you request a RID-local PASID you get one in that range. When you ask
for a global one, you get one in the upper PASID range.
Say 0-4k is reserved for any RID-local allocation. This is also the guest
PASID range. 4k->2^19 is for shared WQ. (I'm not implying the size, just
saying for discussion's sake that we need a separation.)
Those vIOMMUs will have a capability indicating that they support a PASID
allocation interface. When you allocate you can say what type of PASID you need
(shared vs local) and Qemu will obtain one either within the local range or in
the shared range. When PASIDs are allocated in the shared range, you still
end up using one in the guest PASID range, but mapped to a different
host PASID using the VMCS-like PASID redirection per guest (not per RID).
Only the shared allocation requires the /dev/iommu interface. All allocation in
the guest range is fully under Qemu's control, as sketched below.
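A tiny sketch of that split, reusing the existing ioasid_alloc() signature
(the 4k boundary, the set names and the helper are purely illustrative):

/* Illustrative split: guest-local [0, 4096) and shared/global [4096, 1 << 19) */
#define PASID_LOCAL_MAX         4096
#define PASID_GLOBAL_MAX        (1 << 19)

static ioasid_t alloc_pasid(struct ioasid_set *local_set,
                            struct ioasid_set *global_set, bool shared)
{
        if (shared)
                /* shared range: the same window is reserved on every host */
                return ioasid_alloc(global_set, PASID_LOCAL_MAX,
                                    PASID_GLOBAL_MAX - 1, NULL);
        /* guest-managed range, migrated as VM state */
        return ioasid_alloc(local_set, 0, PASID_LOCAL_MAX - 1, NULL);
}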
To support migration, the target also needs to have this capability.
>
> > When we have both SRIOV and shared WQ exposed to the same guest, we
> > do have an issue. The simplest way that I thought was to have a guest
> > and host PASID separation. Where the guest has its own PASID space
> > and host has its own carved out. Guest can do what ever it wants within
> > that allocated space without fear of any collition with any other device.
>
> And how do you reliably migrate if the target kernel has a PASID
> already allocated in that range?
For any shared range, remember there is a mapping table. And since those
ranges are always reserved on the new host, it isn't a problem. For a shared
WQ that requires remapping to a new host PASID, the per-guest VMCS remapping
that does gPASID->hPASID does this job. So the guest PASID
remains unchanged.
Does this make sense?
Cheers,
Ashok
On Mon, May 10, 2021 at 09:22:12AM -0700, Raj, Ashok wrote:
> Those vIOMMU's will have a capability that it supports PASID allocation
> interface. When you allocate you can say what type of PASID you need
> (shared vs local) and Qemu will obtain either within the local range, or in
> the shared range.
Isn't this what I've been saying? This all has to be explicit, and it
is all local to the iommu driver. At worst we'd have some single
global API 'get me a global PASID' which co-ordinates with all the
iommu drivers to actually implement it.
> > > When we have both SRIOV and shared WQ exposed to the same guest, we
> > > do have an issue. The simplest way that I thought was to have a guest
> > > and host PASID separation. Where the guest has its own PASID space
> > > and host has its own carved out. Guest can do what ever it wants within
> > > that allocated space without fear of any collition with any other device.
> >
> > And how do you reliably migrate if the target kernel has a PASID
> > already allocated in that range?
>
> For any shared range, remember there is a mapping table. And since those
> ranges are always reserved in the new host it isn't a problem.
It is a smaller problem - all the ranges still need to match between
hosts and so forth. It is tractable but this all needs to be API'd
properly and nothing can be implicit, including the global/local range
split.
Basically all this needs to come through in your /dev/ioasid API RFC
proposal that I hope is being worked on.
I still think it is smarter to push a group of RID's into a global
allocation group and accept there are potential downsides with that
than to try to force a global allocation group on every RID and then
try to fix the mess that makes for non-ENQCMD devices.
Jason
Hi Jason,
On Mon, 10 May 2021 13:39:56 -0300, Jason Gunthorpe <[email protected]> wrote:
> I still think it is smarter to push a group of RID's into a global
> allocation group and accept there are potential downsides with that
> than to try to force a global allocation group on every RID and then
> try to fix the mess that makes for non-ENQCMD devices.
The proposed ioasid_set change in this set has a token for each set of
IOASIDs.
/**
* struct ioasid_set - Meta data about ioasid_set
* @nh: List of notifiers private to that set
* @xa: XArray to store ioasid_set private IDs, can be used for
* guest-host IOASID mapping, or just a private IOASID namespace.
* @token: Unique to identify an IOASID set
* @type: Token types
* @quota: Max number of IOASIDs can be allocated within the set
* @nr_ioasids: Number of IOASIDs currently allocated in the set
* @id: ID of the set
*/
struct ioasid_set {
        struct atomic_notifier_head nh;
        struct xarray xa;
        void *token;
        int type;
        int quota;
        atomic_t nr_ioasids;
        int id;
        struct rcu_head rcu;
        struct misc_cg *misc_cg; /* For misc cgroup accounting */
};
To satisfy your "give me a PASID for this RID" proposal, can we just use
the RID's struct device as the token? Also add a type field to explicitly
indicate global vs per-set(per-RID). i.e.
ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
                      int type, void *private)
Where type can be:
enum ioasid_hwid_type {
        IOASID_HWID_GLOBAL,
        IOASID_HWID_PER_SET,
};
We are really talking about the HW IOASID, just a reminder.
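For example, usage against the proposed signature above might look like
this (the set names and max value are placeholders):

/* per-RID hw PASID: the set's token is the RID's struct device */
pasid = ioasid_alloc(rid_set, 1, max_pasid, IOASID_HWID_PER_SET, NULL);

/* system-wide hw PASID, e.g. for ENQCMD-style shared workqueue usage */
pasid = ioasid_alloc(global_set, 1, max_pasid, IOASID_HWID_GLOBAL, NULL);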
Thanks,
Jacob
On Mon, May 10, 2021 at 03:28:54PM -0700, Jacob Pan wrote:
> To satisfy your "give me a PASID for this RID" proposal, can we just use
> the RID's struct device as the token? Also add a type field to explicitly
> indicate global vs per-set(per-RID). i.e.
You've got it backwards, the main behavior should be to allocate PASID
per RID.
The special behavior is to bundle a bunch of PASIDs into a grouping
and then say the PASID number space is shared between all the group
members.
/dev/ioasid should create and own this grouping either implicitly or
explicitly. Jumping ahead to in-kernel APIs has missed the critical
step of defining the uAPI and all the behaviors together in a
completed RFC proposal.
Jason
Hi Jason,
On Mon, 10 May 2021 20:45:00 -0300, Jason Gunthorpe <[email protected]> wrote:
> On Mon, May 10, 2021 at 03:28:54PM -0700, Jacob Pan wrote:
>
> > To satisfy your "give me a PASID for this RID" proposal, can we just use
> > the RID's struct device as the token? Also add a type field to
> > explicitly indicate global vs per-set(per-RID). i.e.
>
> You've got it backwards, the main behavior should be to allocate PASID
> per RID.
>
Sure, we can make the local PASID the default. My point was that the
ioasid_set infrastructure's opaque token can support a RID-local allocation
scheme. Anyway, this is a small detail compared to the uAPI.
> The special behavior is to bundle a bunch of PASIDs into a grouping
> and then say the PASID number space is shared between all the group
> members.
>
> /dev/ioasid should create and own this grouping either implicitly or
> explicitly. Jumping ahead to in-kernel APIs has missed the critical
> step of defining the uAPI and all the behaviors together in a
> completed RFC proposal.
>
Agreed, the requirements for kernel API should come from uAPI.
> Jason
Thanks,
Jacob
> From: Jason Gunthorpe
> Sent: Monday, May 10, 2021 8:37 PM
>
[...]
> > gPASID!=hPASID has a problem when assigning a physical device which
> > supports both shared work queue (ENQCMD with PASID in MSR)
> > and dedicated work queue (PASID in device register) to a guest
> > process which is associated to a gPASID. Say the host kernel has setup
> > the hPASID entry with nested translation though /dev/ioasid. For
> > shared work queue the CPU is configured to translate gPASID in MSR
> > into **hPASID** before the payload goes out to the wire. However
> > for dedicated work queue the device MMIO register is directly mapped
> > to and programmed by the guest, thus containing a **gPASID** value
> > implying DMA requests through this interface will hit IOMMU faults
> > due to invalid gPASID entry. Having gPASID==hPASID is a simple
> > workaround here. mdev doesn't have this problem because the
> > PASID register is in emulated control-path thus can be translated
> > to hPASID manually by mdev driver.
>
> This all must be explicit too.
>
> If a PASID is allocated and it is going to be used with ENQCMD then
> everything needs to know it is actually quite different than a PASID
> that was allocated to be used with a normal SRIOV device, for
> instance.
>
> The former case can accept that the guest PASID is virtualized, while
> the lattter can not.
>
> This is also why PASID per RID has to be an option. When I assign a
> full SRIOV function to the guest then that entire RID space needs to
> also be assigned to the guest. Upon migration I need to take all the
> physical PASIDs and rebuild them in another hypervisor exactly as is.
>
> If you force all RIDs into a global PASID pool then normal SRIOV
> migration w/PASID becomes impossible. ie ENQCMD breaks everything else
> that should work.
>
> This is why you need to sort all this out and why it feels like some
> of the specs here have been mis-designed.
>
> I'm not sure carving out ranges is really workable for migration.
>
> I think the real answer is to carve out entire RIDs as being in the
> global pool or not. Then the ENQCMD HW can be bundled together and
> everything else can live in the natural PASID per RID world.
>
OK. Here is the revised scheme, with everything made explicit.
There are three scenarios to be considered:
1) SR-IOV (AMD/ARM):
- "PASID per RID" with guest-allocated PASIDs;
- PASID table managed by guest (in GPA space);
- the entire PASID space delegated to guest;
- no need to explicitly register guest-allocated PASIDs to host;
- uAPI for attaching PASID table:
// set to "PASID per RID"
ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_LOCAL);
// When Qemu captures a new PASID table through vIOMMU;
pasidtbl_ioasid = ioctl(ioasid_fd, IOASID_ALLOC);
ioctl(device_fd, VFIO_ATTACH_IOASID, pasidtbl_ioasid);
// Set the PASID table to the RID associated with pasidtbl_ioasid;
ioctl(ioasid_fd, IOASID_SET_PASID_TABLE, pasidtbl_ioasid, gpa_addr);
2) SR-IOV, no ENQCMD (Intel):
- "PASID per RID" with guest-allocated PASIDs;
- PASID table managed by host (in HPA space);
- the entire PASID space delegated to guest too;
- host must be explicitly notified for guest-allocated PASIDs;
- uAPI for binding user-allocated PASIDs:
// set to "PASID per RID"
ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_LOCAL);
// When Qemu captures a new PASID allocated through vIOMMU;
pgtbl_ioasid = ioctl(ioasid_fd, IOASID_ALLOC);
ioctl(device_fd, VFIO_ATTACH_IOASID, pgtbl_ioasid);
// Tell the kernel to associate pasid to pgtbl_ioasid in internal structure;
// &pasid being a pointer due to a requirement in scenario-3
ioctl(ioasid_fd, IOASID_SET_HWID, pgtbl_ioasid, &pasid);
// Set guest page table to the RID+pasid associated to pgtbl_ioasid
ioctl(ioasid_fd, IOASID_BIND_PGTABLE, pgtbl_ioasid, gpa_addr);
3) SRIOV, ENQCMD (Intel):
- "PASID global" with host-allocated PASIDs;
- PASID table managed by host (in HPA space);
- all RIDs bound to this ioasid_fd use the global pool;
- however, exposing global PASID into guest breaks migration;
- hybrid scheme: split local PASID range and global PASID range;
- force guest to use only local PASID range (through vIOMMU);
- for ENQCMD, configure CPU to translate local->global;
- for non-ENQCMD, setup both local/global pasid entries;
- uAPI for range split and CPU pasid mapping:
// set to "PASID global"
ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_GLOBAL);
// split local/global range, applying to all RIDs in this fd
// Example: local [0, 1024), global [1024, max)
// local PASID range is managed by guest and migrated as VM state
// global PASIDs are re-allocated and mapped to local PASIDs post migration
ioctl(ioasid_fd, IOASID_HWID_SET_GLOBAL_MIN, 1024);
// When Qemu captures a new local_pasid allocated through vIOMMU;
pgtbl_ioasid = ioctl(ioasid_fd, IOASID_ALLOC);
ioctl(device_fd, VFIO_ATTACH_IOASID, pgtbl_ioasid);
// Tell the kernel to associate local_pasid to pgtbl_ioasid in internal structure;
// Due to HWID_GLOBAL, the kernel also allocates a global_pasid from the
// global pool. From now on, every hwid related operations must be applied
// to both PASIDs for this page table;
// global_pasid is returned to userspace in the same field as local_pasid;
ioctl(ioasid_fd, IOASID_SET_HWID, pgtbl_ioasid, &local_pasid);
// Qemu then registers local_pasid/global_pasid pair to KVM for setting up
// CPU PASID translation table;
ioctl(kvm_fd, KVM_SET_PASID_MAPPING, local_pasid, global_pasid);
// Set guest page table to the RID+{local_pasid, global_pasid} associated
// to pgtbl_ioasid;
ioctl(ioasid_fd, IOASID_BIND_PGTABLE, pgtbl_ioasid, gpa_addr);
-----
Notes:
I tried to keep the common commands in a generic format for all scenarios, while
introducing new commands for usage-specific requirements. Everything is
made explicit now.
The userspace has sufficient information to choose its desired scheme based
on vIOMMU types and platform information (e.g. whether ENQCMD is exposed
in virtual CPUID, whether assigned devices support DMWr, etc.).
Above example assumes one RID per bound page table, because vIOMMU
identifies new guest page tables per-RID. If there are other usages requiring
multiple RIDs per page table, SET_HWID/BIND_PGTABLE could accept
another device_handle parameter to specify which RID is targeted for this
operation.
When considering SIOV/mdev there is no change to the above uAPI sequence.
It's n/a for 1) as SIOV requires the PASID table in HPA space, nor does it
cause any change to 3) regarding the split range scheme. The only
conceptual change is in 2), where although it's still "PASID per RID" the
PASIDs must be managed by the host because the parent driver also allocates
PASIDs from the per-RID space to mark mdevs (RID+PASID). But this difference
doesn't change the uAPI flow - just treat the user-provisioned PASID as 'virtual'
and then allocate a 'real' PASID at IOASID_SET_HWID. Later always use the
real one when programming the PASID entry (IOASID_BIND_PGTABLE) or the device
PASID register (converted in the mediation path).
If all of the above can work reasonably, we don't even need the special VCMD
interface in VT-d for the guest to allocate PASIDs from the host. Just always let
the guest manage its PASIDs (with the restriction of available local PASIDs),
whether with a global allocator or a per-RID allocator. The vIOMMU side just
sticks to the per-RID emulation according to the spec.
Thanks
Kevin
On Tue, 11 May 2021 09:10:03 +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe
> > Sent: Monday, May 10, 2021 8:37 PM
> >
> [...]
> > > gPASID!=hPASID has a problem when assigning a physical device which
> > > supports both shared work queue (ENQCMD with PASID in MSR)
> > > and dedicated work queue (PASID in device register) to a guest
> > > process which is associated to a gPASID. Say the host kernel has setup
> > > the hPASID entry with nested translation though /dev/ioasid. For
> > > shared work queue the CPU is configured to translate gPASID in MSR
> > > into **hPASID** before the payload goes out to the wire. However
> > > for dedicated work queue the device MMIO register is directly mapped
> > > to and programmed by the guest, thus containing a **gPASID** value
> > > implying DMA requests through this interface will hit IOMMU faults
> > > due to invalid gPASID entry. Having gPASID==hPASID is a simple
> > > workaround here. mdev doesn't have this problem because the
> > > PASID register is in emulated control-path thus can be translated
> > > to hPASID manually by mdev driver.
> >
> > This all must be explicit too.
> >
> > If a PASID is allocated and it is going to be used with ENQCMD then
> > everything needs to know it is actually quite different than a PASID
> > that was allocated to be used with a normal SRIOV device, for
> > instance.
> >
> > The former case can accept that the guest PASID is virtualized, while
> > the lattter can not.
> >
> > This is also why PASID per RID has to be an option. When I assign a
> > full SRIOV function to the guest then that entire RID space needs to
> > also be assigned to the guest. Upon migration I need to take all the
> > physical PASIDs and rebuild them in another hypervisor exactly as is.
> >
> > If you force all RIDs into a global PASID pool then normal SRIOV
> > migration w/PASID becomes impossible. ie ENQCMD breaks everything else
> > that should work.
> >
> > This is why you need to sort all this out and why it feels like some
> > of the specs here have been mis-designed.
> >
> > I'm not sure carving out ranges is really workable for migration.
> >
> > I think the real answer is to carve out entire RIDs as being in the
> > global pool or not. Then the ENQCMD HW can be bundled together and
> > everything else can live in the natural PASID per RID world.
> >
>
> OK. Here is the revised scheme by making it explicitly.
>
> There are three scenarios to be considered:
>
> 1) SR-IOV (AMD/ARM):
> - "PASID per RID" with guest-allocated PASIDs;
> - PASID table managed by guest (in GPA space);
> - the entire PASID space delegated to guest;
> - no need to explicitly register guest-allocated PASIDs to host;
> - uAPI for attaching PASID table:
>
> // set to "PASID per RID"
> ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_LOCAL);
>
> // When Qemu captures a new PASID table through vIOMMU;
> pasidtbl_ioasid = ioctl(ioasid_fd, IOASID_ALLOC);
> ioctl(device_fd, VFIO_ATTACH_IOASID, pasidtbl_ioasid);
>
> // Set the PASID table to the RID associated with pasidtbl_ioasid;
> ioctl(ioasid_fd, IOASID_SET_PASID_TABLE, pasidtbl_ioasid, gpa_addr);
>
> 2) SR-IOV, no ENQCMD (Intel):
> - "PASID per RID" with guest-allocated PASIDs;
> - PASID table managed by host (in HPA space);
> - the entire PASID space delegated to guest too;
> - host must be explicitly notified for guest-allocated PASIDs;
> - uAPI for binding user-allocated PASIDs:
>
> // set to "PASID per RID"
> ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_LOCAL);
>
> // When Qemu captures a new PASID allocated through vIOMMU;
Is this achieved by VCMD or by capturing the guest's PASID cache invalidation?
> pgtbl_ioasid = ioctl(ioasid_fd, IOASID_ALLOC);
> ioctl(device_fd, VFIO_ATTACH_IOASID, pgtbl_ioasid);
>
> // Tell the kernel to associate pasid to pgtbl_ioasid in internal structure;
> // &pasid being a pointer due to a requirement in scenario-3
> ioctl(ioasid_fd, IOASID_SET_HWID, pgtbl_ioasid, &pasid);
>
> // Set guest page table to the RID+pasid associated to pgtbl_ioasid
> ioctl(ioasid_fd, IOASID_BIND_PGTABLE, pgtbl_ioasid, gpa_addr);
>
> 3) SRIOV, ENQCMD (Intel):
> - "PASID global" with host-allocated PASIDs;
> - PASID table managed by host (in HPA space);
> - all RIDs bound to this ioasid_fd use the global pool;
> - however, exposing global PASID into guest breaks migration;
> - hybrid scheme: split local PASID range and global PASID range;
> - force guest to use only local PASID range (through vIOMMU);
> - for ENQCMD, configure CPU to translate local->global;
> - for non-ENQCMD, setup both local/global pasid entries;
> - uAPI for range split and CPU pasid mapping:
>
> // set to "PASID global"
> ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_GLOBAL);
>
> // split local/global range, applying to all RIDs in this fd
> // Example: local [0, 1024), global [1024, max)
> // local PASID range is managed by guest and migrated as VM state
> // global PASIDs are re-allocated and mapped to local PASIDs post migration
> ioctl(ioasid_fd, IOASID_HWID_SET_GLOBAL_MIN, 1024);
>
> // When Qemu captures a new local_pasid allocated through vIOMMU;
> pgtbl_ioasid = ioctl(ioasid_fd, IOASID_ALLOC);
> ioctl(device_fd, VFIO_ATTACH_IOASID, pgtbl_ioasid);
>
> // Tell the kernel to associate local_pasid to pgtbl_ioasid in internal structure;
> // Due to HWID_GLOBAL, the kernel also allocates a global_pasid from the
> // global pool. From now on, every hwid related operations must be applied
> // to both PASIDs for this page table;
> // global_pasid is returned to userspace in the same field as local_pasid;
> ioctl(ioasid_fd, IOASID_SET_HWID, pgtbl_ioasid, &local_pasid);
>
> // Qemu then registers local_pasid/global_pasid pair to KVM for setting up
> // CPU PASID translation table;
> ioctl(kvm_fd, KVM_SET_PASID_MAPPING, local_pasid, global_pasid);
>
> // Set guest page table to the RID+{local_pasid, global_pasid} associated
> // to pgtbl_ioasid;
> ioctl(ioasid_fd, IOASID_BIND_PGTABLE, pgtbl_ioasid, gpa_addr);
>
> -----
> Notes:
>
> I tried to keep common commands in generic format for all scenarios, while
> introducing new commands for usage-specific requirement. Everything is
> made explicit now.
>
> The userspace has sufficient information to choose its desired scheme based
> on vIOMMU types and platform information (e.g. whether ENQCMD is exposed
> in virtual CPUID, whether assigned devices support DMWr, etc.).
>
> Above example assumes one RID per bound page table, because vIOMMU
> identifies new guest page tables per-RID. If there are other usages requiring
> multiple RIDs per page table, SET_HWID/BIND_PGTABLE could accept
> another device_handle parameter to specify which RID is targeted for this
> operation.
>
> When considering SIOV/mdev there is no change to above uAPI sequence.
> It's n/a for 1) as SIOV requires PASID table in HPA space, nor does it
> cause any change to 3) regarding to the split range scheme. The only
> conceptual change is in 2), where although it's still "PASID per RID" the
> PASIDs must be managed by host because the parent driver also allocates
> PASIDs from per-RID space to mark mdev (RID+PASID). But this difference
> doesn't change the uAPI flow - just treat user-provisioned PASID as 'virtual'
> and then allocate a 'real' PASID at IOASID_SET_HWID. Later always use the
> real one when programming PASID entry (IOASID_BIND_PGTABLE) or device
> PASID register (converted in the mediation path).
>
> If all above can work reasonably, we even don't need the special VCMD
> interface in VT-d for guest to allocate PASIDs from host. Just always let
> the guest to manage its PASIDs (with restriction of available local PASIDs),
> being a global allocator or per-RID allocator. the vIOMMU side just stick
> to the per-RID emulation according to the spec.
Yeah, if this scheme for scenario 3) is good, we may limit the range of
local PASIDs by limiting the PASID bit width of the vIOMMU. QEMU can learn the
local PASID allocated by the guest IOMMU when the guest does a PASID cache
invalidation.
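As a rough sketch of that QEMU-side flow for scenario 3 (the function and
variable names are illustrative only, and the pseudo-ioctl syntax follows the
proposal in this thread rather than any real uAPI):

/* illustrative only: called when the guest's PASID cache invalidation
 * reveals a newly allocated local_pasid for a first-level page table */
static void viommu_bind_guest_pasid(int ioasid_fd, int device_fd, int kvm_fd,
                                    uint32_t local_pasid, uint64_t gpa_addr)
{
        uint32_t pasid = local_pasid;   /* in: local, out: global */
        int pgtbl_ioasid = ioctl(ioasid_fd, IOASID_ALLOC);

        ioctl(device_fd, VFIO_ATTACH_IOASID, pgtbl_ioasid);
        /* kernel records local_pasid and returns the global PASID it
         * allocated in the same field */
        ioctl(ioasid_fd, IOASID_SET_HWID, pgtbl_ioasid, &pasid);
        /* register the local/global pair for CPU PASID translation */
        ioctl(kvm_fd, KVM_SET_PASID_MAPPING, local_pasid, pasid);
        ioctl(ioasid_fd, IOASID_BIND_PGTABLE, pgtbl_ioasid, gpa_addr);
}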
--
Regards,
Yi Liu
On Tue, May 11, 2021 at 09:10:03AM +0000, Tian, Kevin wrote:
> 3) SRIOV, ENQCMD (Intel):
> - "PASID global" with host-allocated PASIDs;
> - PASID table managed by host (in HPA space);
> - all RIDs bound to this ioasid_fd use the global pool;
> - however, exposing global PASID into guest breaks migration;
> - hybrid scheme: split local PASID range and global PASID range;
> - force guest to use only local PASID range (through vIOMMU);
> - for ENQCMD, configure CPU to translate local->global;
> - for non-ENQCMD, setup both local/global pasid entries;
> - uAPI for range split and CPU pasid mapping:
>
> // set to "PASID global"
> ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_GLOBAL);
>
> // split local/global range, applying to all RIDs in this fd
> // Example: local [0, 1024), global [1024, max)
> // local PASID range is managed by guest and migrated as VM state
> // global PASIDs are re-allocated and mapped to local PASIDs post migration
> ioctl(ioasid_fd, IOASID_HWID_SET_GLOBAL_MIN, 1024);
I'm still not sold that ranges are the best idea here; it just adds
more state that has to match during migration. Keeping the
global/local split per RID seems much cleaner to me.
This is also why I don't really like having the global/local choice be global
to the ioasid fd either. It would be better to specify global/local as part
of each VFIO_ATTACH_IOASID so that each device is moved to the correct
allocator.
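For example, purely as a sketch (the extra flag argument on attach is not
part of any existing proposal, and the flag names just reuse the proposal's
spelling):

// illustrative only: choose the allocator per attachment rather than per fd
ioctl(sriov_device_fd, VFIO_ATTACH_IOASID, pgtbl_ioasid, IOASID_HWID_LOCAL);
ioctl(enqcmd_device_fd, VFIO_ATTACH_IOASID, pgtbl_ioasid, IOASID_HWID_GLOBAL);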
> When considering SIOV/mdev there is no change to above uAPI sequence.
> It's n/a for 1) as SIOV requires PASID table in HPA space, nor does it
> cause any change to 3) regarding to the split range scheme. The only
> conceptual change is in 2), where although it's still "PASID per RID" the
> PASIDs must be managed by host because the parent driver also allocates
> PASIDs from per-RID space to mark mdev (RID+PASID). But this difference
> doesn't change the uAPI flow - just treat user-provisioned PASID as 'virtual'
> and then allocate a 'real' PASID at IOASID_SET_HWID. Later always use the
> real one when programming PASID entry (IOASID_BIND_PGTABLE) or device
> PASID register (converted in the mediation path).
It does need some user-visible difference because SIOV/mdev is not
migratable. Only the kernel can select a PASID; userspace (and hence
the guest) shouldn't have the option to force a specific PASID, as the
PASID space of the entire RID is shared by all VMs using the mdev.
I don't see any alternative to telling every part whether the PASID is
going to be used by ENQCMD or not; too many important decisions rest
on this detail.
Jason
> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, May 11, 2021 10:39 PM
>
> On Tue, May 11, 2021 at 09:10:03AM +0000, Tian, Kevin wrote:
>
> > 3) SRIOV, ENQCMD (Intel):
> > - "PASID global" with host-allocated PASIDs;
> > - PASID table managed by host (in HPA space);
> > - all RIDs bound to this ioasid_fd use the global pool;
> > - however, exposing global PASID into guest breaks migration;
> > - hybrid scheme: split local PASID range and global PASID range;
> > - force guest to use only local PASID range (through vIOMMU);
> > - for ENQCMD, configure CPU to translate local->global;
> > - for non-ENQCMD, setup both local/global pasid entries;
> > - uAPI for range split and CPU pasid mapping:
> >
> > // set to "PASID global"
> > ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_GLOBAL);
> >
> > // split local/global range, applying to all RIDs in this fd
> > // Example: local [0, 1024), global [1024, max)
> > // local PASID range is managed by guest and migrated as VM state
> > // global PASIDs are re-allocated and mapped to local PASIDs post
> migration
> > ioctl(ioasid_fd, IOASID_HWID_SET_GLOBAL_MIN, 1024);
>
> I'm still not sold that ranges are the best idea here, it just adds
> more state that has to match during migration. Keeping the
> global/local split per RID seems much cleaner to me
With ENQCMD the PASID is kept in a CPU MSR, making it part of the process
context within the guest. When a guest process is bound to two
devices, the same local PASID must be usable on both devices.
A per-RID split cannot guarantee that.
>
> This is also why I don't really like having the global/local be global
> to the ioasid either. It would be better to specify global/local as
> part of each VFIO_ATTACH_IOASID so each device is moved to the correct
> allocator.
That was my original thought, but for the above reason this has to be
enforced globally across this ioasid fd.
>
> > When considering SIOV/mdev there is no change to above uAPI sequence.
> > It's n/a for 1) as SIOV requires PASID table in HPA space, nor does it
> > cause any change to 3) regarding to the split range scheme. The only
> > conceptual change is in 2), where although it's still "PASID per RID" the
> > PASIDs must be managed by host because the parent driver also allocates
> > PASIDs from per-RID space to mark mdev (RID+PASID). But this difference
> > doesn't change the uAPI flow - just treat user-provisioned PASID as 'virtual'
> > and then allocate a 'real' PASID at IOASID_SET_HWID. Later always use the
> > real one when programming PASID entry (IOASID_BIND_PGTABLE) or
> device
> > PASID register (converted in the mediation path).
>
> It does need some user visible difference because SIOV/mdev is not
> migratable. Only the kernel can select a PASID, userspace (and hence
> the guest) shouldn't have the option to force a specific PASID as the
> PASID space is shared across the entire RID to all VMs using the mdev.
It's not migratable only when you choose to expose host-allocated PASIDs
into the guest. However, throughout this proposal we actually virtualize
PASIDs, letting the guest manage its own PASID space in all scenarios
(whether SR-IOV or SIOV), though the size of the PASID space might differ.
The PASID chosen by the guest may be used as the hw PASID when the
PASID space is delegated to the guest (e.g. SR-IOV in scenario 1), or it may
be mapped to a different PASID allocated by the host (e.g. in this mdev
case or ENQCMD in scenario 3). From the uAPI p.o.v. userspace just
needs to attach its own pasid to the ioasid while the kernel decides
the real hwid underneath (whether the same or a different one). Migration
only needs to cover guest-allocated PASIDs, with all host-side PASIDs
hidden from userspace and reconstructed on the new machine
post-migration (following common virtualization practice).
The only exception, where we return a host-allocated PASID to userspace
in scenario 3, exists because Qemu needs that information to update the CPU
PASID translation table through KVM. Earlier you suggested that this
must be done explicitly through userspace instead of by an implicit
notification between ioasid and kvm in the kernel.
>
> I don't see any alternative to telling every part if the PASID is
> going to be used by ENQCMD or not, too many important decisions rest
> on this detail.
>
> Jason
Thanks
Kevin
> From: Liu Yi L <[email protected]>
> Sent: Tuesday, May 11, 2021 9:25 PM
>
> On Tue, 11 May 2021 09:10:03 +0000, Tian, Kevin wrote:
>
> > > From: Jason Gunthorpe
> > > Sent: Monday, May 10, 2021 8:37 PM
> > >
> > [...]
> > > > gPASID!=hPASID has a problem when assigning a physical device which
> > > > supports both shared work queue (ENQCMD with PASID in MSR)
> > > > and dedicated work queue (PASID in device register) to a guest
> > > > process which is associated to a gPASID. Say the host kernel has setup
> > > > the hPASID entry with nested translation though /dev/ioasid. For
> > > > shared work queue the CPU is configured to translate gPASID in MSR
> > > > into **hPASID** before the payload goes out to the wire. However
> > > > for dedicated work queue the device MMIO register is directly mapped
> > > > to and programmed by the guest, thus containing a **gPASID** value
> > > > implying DMA requests through this interface will hit IOMMU faults
> > > > due to invalid gPASID entry. Having gPASID==hPASID is a simple
> > > > workaround here. mdev doesn't have this problem because the
> > > > PASID register is in emulated control-path thus can be translated
> > > > to hPASID manually by mdev driver.
> > >
> > > This all must be explicit too.
> > >
> > > If a PASID is allocated and it is going to be used with ENQCMD then
> > > everything needs to know it is actually quite different than a PASID
> > > that was allocated to be used with a normal SRIOV device, for
> > > instance.
> > >
> > > The former case can accept that the guest PASID is virtualized, while
> > > the lattter can not.
> > >
> > > This is also why PASID per RID has to be an option. When I assign a
> > > full SRIOV function to the guest then that entire RID space needs to
> > > also be assigned to the guest. Upon migration I need to take all the
> > > physical PASIDs and rebuild them in another hypervisor exactly as is.
> > >
> > > If you force all RIDs into a global PASID pool then normal SRIOV
> > > migration w/PASID becomes impossible. ie ENQCMD breaks everything
> else
> > > that should work.
> > >
> > > This is why you need to sort all this out and why it feels like some
> > > of the specs here have been mis-designed.
> > >
> > > I'm not sure carving out ranges is really workable for migration.
> > >
> > > I think the real answer is to carve out entire RIDs as being in the
> > > global pool or not. Then the ENQCMD HW can be bundled together and
> > > everything else can live in the natural PASID per RID world.
> > >
> >
> > OK. Here is the revised scheme by making it explicitly.
> >
> > There are three scenarios to be considered:
> >
> > 1) SR-IOV (AMD/ARM):
> > - "PASID per RID" with guest-allocated PASIDs;
> > - PASID table managed by guest (in GPA space);
> > - the entire PASID space delegated to guest;
> > - no need to explicitly register guest-allocated PASIDs to host;
> > - uAPI for attaching PASID table:
> >
> > // set to "PASID per RID"
> > ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_LOCAL);
> >
> > // When Qemu captures a new PASID table through vIOMMU;
> > pasidtbl_ioasid = ioctl(ioasid_fd, IOASID_ALLOC);
> > ioctl(device_fd, VFIO_ATTACH_IOASID, pasidtbl_ioasid);
> >
> > // Set the PASID table to the RID associated with pasidtbl_ioasid;
> > ioctl(ioasid_fd, IOASID_SET_PASID_TABLE, pasidtbl_ioasid, gpa_addr);
> >
> > 2) SR-IOV, no ENQCMD (Intel):
> > - "PASID per RID" with guest-allocated PASIDs;
> > - PASID table managed by host (in HPA space);
> > - the entire PASID space delegated to guest too;
> > - host must be explicitly notified for guest-allocated PASIDs;
> > - uAPI for binding user-allocated PASIDs:
> >
> > // set to "PASID per RID"
> > ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_LOCAL);
> >
> > // When Qemu captures a new PASID allocated through vIOMMU;
>
> Is this achieved by VCMD or by capturing guest's PASID cache invalidation?
The latter one
>
> > pgtbl_ioasid = ioctl(ioasid_fd, IOASID_ALLOC);
> > ioctl(device_fd, VFIO_ATTACH_IOASID, pgtbl_ioasid);
> >
> > // Tell the kernel to associate pasid to pgtbl_ioasid in internal structure;
> > // &pasid being a pointer due to a requirement in scenario-3
> > ioctl(ioasid_fd, IOASID_SET_HWID, pgtbl_ioasid, &pasid);
> >
> > // Set guest page table to the RID+pasid associated to pgtbl_ioasid
> > ioctl(ioasid_fd, IOASID_BIND_PGTABLE, pgtbl_ioasid, gpa_addr);
> >
> > 3) SRIOV, ENQCMD (Intel):
> > - "PASID global" with host-allocated PASIDs;
> > - PASID table managed by host (in HPA space);
> > - all RIDs bound to this ioasid_fd use the global pool;
> > - however, exposing global PASID into guest breaks migration;
> > - hybrid scheme: split local PASID range and global PASID range;
> > - force guest to use only local PASID range (through vIOMMU);
> > - for ENQCMD, configure CPU to translate local->global;
> > - for non-ENQCMD, setup both local/global pasid entries;
> > - uAPI for range split and CPU pasid mapping:
> >
> > // set to "PASID global"
> > ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_GLOBAL);
> >
> > // split local/global range, applying to all RIDs in this fd
> > // Example: local [0, 1024), global [1024, max)
> > // local PASID range is managed by guest and migrated as VM state
> > // global PASIDs are re-allocated and mapped to local PASIDs post
> migration
> > ioctl(ioasid_fd, IOASID_HWID_SET_GLOBAL_MIN, 1024);
> >
> > // When Qemu captures a new local_pasid allocated through vIOMMU;
> > pgtbl_ioasid = ioctl(ioasid_fd, IOASID_ALLOC);
> > ioctl(device_fd, VFIO_ATTACH_IOASID, pgtbl_ioasid);
> >
> > // Tell the kernel to associate local_pasid to pgtbl_ioasid in internal
> structure;
> > // Due to HWID_GLOBAL, the kernel also allocates a global_pasid from
> the
> > // global pool. From now on, every hwid related operations must be
> applied
> > // to both PASIDs for this page table;
> > // global_pasid is returned to userspace in the same field as local_pasid;
> > ioctl(ioasid_fd, IOASID_SET_HWID, pgtbl_ioasid, &local_pasid);
> >
> > // Qemu then registers local_pasid/global_pasid pair to KVM for setting
> up
> > // CPU PASID translation table;
> > ioctl(kvm_fd, KVM_SET_PASID_MAPPING, local_pasid, global_pasid);
> >
> > // Set guest page table to the RID+{local_pasid, global_pasid} associated
> > // to pgtbl_ioasid;
> > ioctl(ioasid_fd, IOASID_BIND_PGTABLE, pgtbl_ioasid, gpa_addr);
> >
> > -----
> > Notes:
> >
> > I tried to keep common commands in generic format for all scenarios, while
> > introducing new commands for usage-specific requirement. Everything is
> > made explicit now.
> >
> > The userspace has sufficient information to choose its desired scheme
> based
> > on vIOMMU types and platform information (e.g. whether ENQCMD is
> exposed
> > in virtual CPUID, whether assigned devices support DMWr, etc.).
> >
> > Above example assumes one RID per bound page table, because vIOMMU
> > identifies new guest page tables per-RID. If there are other usages requiring
> > multiple RIDs per page table, SET_HWID/BIND_PGTABLE could accept
> > another device_handle parameter to specify which RID is targeted for this
> > operation.
> >
> > When considering SIOV/mdev there is no change to above uAPI sequence.
> > It's n/a for 1) as SIOV requires PASID table in HPA space, nor does it
> > cause any change to 3) regarding to the split range scheme. The only
> > conceptual change is in 2), where although it's still "PASID per RID" the
> > PASIDs must be managed by host because the parent driver also allocates
> > PASIDs from per-RID space to mark mdev (RID+PASID). But this difference
> > doesn't change the uAPI flow - just treat user-provisioned PASID as 'virtual'
> > and then allocate a 'real' PASID at IOASID_SET_HWID. Later always use the
> > real one when programming PASID entry (IOASID_BIND_PGTABLE) or
> device
> > PASID register (converted in the mediation path).
> >
> > If all above can work reasonably, we even don't need the special VCMD
> > interface in VT-d for guest to allocate PASIDs from host. Just always let
> > the guest to manage its PASIDs (with restriction of available local PASIDs),
> > being a global allocator or per-RID allocator. the vIOMMU side just stick
> > to the per-RID emulation according to the spec.
>
> yeah, if this scheme for scenario 3) is good. We may limit the range of
> local PASIDs by limiting the PASID bit width of vIOMMU. QEMU can get the
> local PASID allocated by guest IOMMU when guest does PASID cache
> invalidation.
>
> --
> Regards,
> Yi Liu
On Tue, May 11, 2021 at 10:51:40PM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Tuesday, May 11, 2021 10:39 PM
> >
> > On Tue, May 11, 2021 at 09:10:03AM +0000, Tian, Kevin wrote:
> >
> > > 3) SRIOV, ENQCMD (Intel):
> > > - "PASID global" with host-allocated PASIDs;
> > > - PASID table managed by host (in HPA space);
> > > - all RIDs bound to this ioasid_fd use the global pool;
> > > - however, exposing global PASID into guest breaks migration;
> > > - hybrid scheme: split local PASID range and global PASID range;
> > > - force guest to use only local PASID range (through vIOMMU);
> > > - for ENQCMD, configure CPU to translate local->global;
> > > - for non-ENQCMD, setup both local/global pasid entries;
> > > - uAPI for range split and CPU pasid mapping:
> > >
> > > // set to "PASID global"
> > > ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_GLOBAL);
> > >
> > > // split local/global range, applying to all RIDs in this fd
> > > // Example: local [0, 1024), global [1024, max)
> > > // local PASID range is managed by guest and migrated as VM state
> > > // global PASIDs are re-allocated and mapped to local PASIDs post
> > migration
> > > ioctl(ioasid_fd, IOASID_HWID_SET_GLOBAL_MIN, 1024);
> >
> > I'm still not sold that ranges are the best idea here, it just adds
> > more state that has to match during migration. Keeping the
> > global/local split per RID seems much cleaner to me
>
> With ENQCMD the PASID is kept in CPU MSR, making it a process
> context within the guest. When a guest process is bound to two
> devices, the same local PASID must be usable on both devices.
> Having per RID split cannot guarantee it.
That is only for ENQCMD. All drivers know whether they are ENQCMD
compatible and can ensure they use the global allocator
consistently for their RIDs.
Basically each RID knows, based on its kernel drivers, whether it is a local
or global RID, and the ioasid knob can further fine-tune this for any
other specialty cases.
> > It does need some user visible difference because SIOV/mdev is not
> > migratable. Only the kernel can select a PASID, userspace (and hence
> > the guest) shouldn't have the option to force a specific PASID as the
> > PASID space is shared across the entire RID to all VMs using the mdev.
>
> not migratable only when you choose exposing host-allocated PASID
> into guest. However in the entire this proposal we actually virtualize
> PASIDs, letting the guest manage its own PASID space in all
> scenarios
PASID cannot be virtualized without also using ENQCMD.
An mdev that is using PASID without ENQCMD is non-migratable, and this
needs to be made visible in the uAPI.
Jason
> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, May 12, 2021 7:40 AM
>
> On Tue, May 11, 2021 at 10:51:40PM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Tuesday, May 11, 2021 10:39 PM
> > >
> > > On Tue, May 11, 2021 at 09:10:03AM +0000, Tian, Kevin wrote:
> > >
> > > > 3) SRIOV, ENQCMD (Intel):
> > > > - "PASID global" with host-allocated PASIDs;
> > > > - PASID table managed by host (in HPA space);
> > > > - all RIDs bound to this ioasid_fd use the global pool;
> > > > - however, exposing global PASID into guest breaks migration;
> > > > - hybrid scheme: split local PASID range and global PASID range;
> > > > - force guest to use only local PASID range (through vIOMMU);
> > > > - for ENQCMD, configure CPU to translate local->global;
> > > > - for non-ENQCMD, setup both local/global pasid entries;
> > > > - uAPI for range split and CPU pasid mapping:
> > > >
> > > > // set to "PASID global"
> > > > ioctl(ioasid_fd, IOASID_SET_HWID_MODE, IOASID_HWID_GLOBAL);
> > > >
> > > > // split local/global range, applying to all RIDs in this fd
> > > > // Example: local [0, 1024), global [1024, max)
> > > > // local PASID range is managed by guest and migrated as VM state
> > > > // global PASIDs are re-allocated and mapped to local PASIDs post
> > > migration
> > > > ioctl(ioasid_fd, IOASID_HWID_SET_GLOBAL_MIN, 1024);
> > >
> > > I'm still not sold that ranges are the best idea here, it just adds
> > > more state that has to match during migration. Keeping the
> > > global/local split per RID seems much cleaner to me
> >
> > With ENQCMD the PASID is kept in CPU MSR, making it a process
> > context within the guest. When a guest process is bound to two
> > devices, the same local PASID must be usable on both devices.
> > Having per RID split cannot guarantee it.
>
> That is only for ENQCMD. All drivers know if they are ENQCMD
> compatible drivers and can ensure they use the global allocator
> consistently for their RIDs.
>
> Basically each RID knows based on its kernel drivers if it is a local
> or global RID and the ioasid knob can further fine tune this for any
> other specialty cases.
It's fine if you insist on this approach. Then we leave it to userspace to
ensure the same split range is used across devices where a vIOMMU is
concerned. Please note such a range split has to be enforced through the
vIOMMU, which (e.g. on VT-d) includes a register reporting the available
PASID space size (applying to all devices behind this vIOMMU) to
the guest. The kernel just follows the per-RID split info; if anything is
broken, userspace just shoots itself in the foot.
>
> > > It does need some user visible difference because SIOV/mdev is not
> > > migratable. Only the kernel can select a PASID, userspace (and hence
> > > the guest) shouldn't have the option to force a specific PASID as the
> > > PASID space is shared across the entire RID to all VMs using the mdev.
> >
> > not migratable only when you choose exposing host-allocated PASID
> > into guest. However in the entire this proposal we actually virtualize
> > PASIDs, letting the guest manage its own PASID space in all
> > scenarios
>
> PASID cannot be virtualized without also using ENQCMD.
>
> A mdev that is using PASID without ENQCMD is non-migratable and this
> needs to be make visiable in the uAPI.
>
No. Without ENQCMD the PASID must be programmed into an mdev MMIO
register. This operation is mediated, so the mdev driver can translate the
PASID from virtual to real, as in the sketch below.
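A minimal sketch of that mediation path, with entirely hypothetical function,
structure, and register names (this is not existing mdev driver code):

/* hypothetical mdev write handler for the device's PASID MMIO register */
static int my_mdev_write_pasid_reg(struct my_mdev *m, u32 vpasid)
{
        u32 hpasid;

        /* look up the real PASID the kernel allocated when userspace did
         * IOASID_SET_HWID with the virtual PASID */
        if (my_mdev_vpasid_to_hpasid(m, vpasid, &hpasid))
                return -EINVAL;

        /* program the translated, real PASID into the hardware */
        writel(hpasid, m->mmio_base + MY_PASID_REG);
        return 0;
}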
Thanks
Kevin
On Wed, May 12, 2021 at 12:21:24AM +0000, Tian, Kevin wrote:
> > Basically each RID knows based on its kernel drivers if it is a local
> > or global RID and the ioasid knob can further fine tune this for any
> > other specialty cases.
>
> It's fine if you insist on this way. Then we leave it to userspace to
> ensure same split range is used across devices when vIOMMU is
> concerned.
I'm still confused about why a split range is needed.
> Please note such range split has to be enforced through
> vIOMMU which (e.g. on VT-d) includes a register to report available
> PASID space size (applying to all devices behind this vIOMMU) to
> the guest. The kernel just follows per-RID split info. If anything broken,
> the userspace just shoots its own foot.
Is it because this specific vIOMMU protocol is limiting things?
> > > > It does need some user visible difference because SIOV/mdev is not
> > > > migratable. Only the kernel can select a PASID, userspace (and hence
> > > > the guest) shouldn't have the option to force a specific PASID as the
> > > > PASID space is shared across the entire RID to all VMs using the mdev.
> > >
> > > not migratable only when you choose exposing host-allocated PASID
> > > into guest. However in the entire this proposal we actually virtualize
> > > PASIDs, letting the guest manage its own PASID space in all
> > > scenarios
> >
> > PASID cannot be virtualized without also using ENQCMD.
> >
> > A mdev that is using PASID without ENQCMD is non-migratable and this
> > needs to be make visiable in the uAPI.
>
> No. without ENQCMD the PASID must be programmed to a mdev MMIO
> register. This operation is mediated then mdev driver can translate the
> PASID from virtual to real.
That is probably unworkable with real devices, but if you do this you
need to explicitly expose the vPASID to the mdev API somehow; the
device still needs to declare whether it supports this, and devices that
don't should still work in a non-migratable mode.
Jason
> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, May 12, 2021 8:25 AM
>
> On Wed, May 12, 2021 at 12:21:24AM +0000, Tian, Kevin wrote:
>
> > > Basically each RID knows based on its kernel drivers if it is a local
> > > or global RID and the ioasid knob can further fine tune this for any
> > > other specialty cases.
> >
> > It's fine if you insist on this way. Then we leave it to userspace to
> > ensure same split range is used across devices when vIOMMU is
> > concerned.
>
> I'm still confused why there is a split range needed.
A device could support both ENQCMD and non-ENQCMD submissions.
For the ENQCMD path, the CPU provides a PASID translation mechanism (from
guest PASID to host PASID).
For the non-ENQCMD path, the guest driver directly programs the untranslated
guest PASID into the device MMIO register.
The host kernel only sets up the host PASID entry, which is the hwid for a
given ioasid page table.
If we don't split the range, we have to assume guest PASID == host PASID,
otherwise the non-ENQCMD path will fail. But exposing the host PASID to the
guest breaks migration.
If we want to allow migration, we need to support guest PASID != host
PASID and make sure both entries point to the same page table so that
ENQCMD (host PASID) and non-ENQCMD (guest PASID) can both work.
This requires a range split to avoid conflicts between host and guest PASIDs
in the same space, as sketched below.
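A minimal sketch of the aliasing idea, with hypothetical helper names (this
is not existing VT-d code): both PASID entries are installed to point at the
same first-level page table over the same second-level translation.

/* hypothetical helpers; illustrative only */
/* guest PASID: programmed directly by the guest driver (non-ENQCMD path) */
setup_nested_pasid_entry(iommu, dev, guest_pasid, guest_pgtbl, s2_domain);
/* host PASID: produced by CPU gPASID->hPASID translation (ENQCMD path) */
setup_nested_pasid_entry(iommu, dev, host_pasid, guest_pgtbl, s2_domain);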
>
> > Please note such range split has to be enforced through
> > vIOMMU which (e.g. on VT-d) includes a register to report available
> > PASID space size (applying to all devices behind this vIOMMU) to
> > the guest. The kernel just follows per-RID split info. If anything broken,
> > the userspace just shoots its own foot.
>
> Is it because this specific vIOMMU protocol is limiting things?
When the range split is enabled, we need a way to tell the guest the
local range size so guest PASIDs are allocated only within that range. The
vIOMMU is then used to expose this information.
>
> > > > > It does need some user visible difference because SIOV/mdev is not
> > > > > migratable. Only the kernel can select a PASID, userspace (and hence
> > > > > the guest) shouldn't have the option to force a specific PASID as the
> > > > > PASID space is shared across the entire RID to all VMs using the mdev.
> > > >
> > > > not migratable only when you choose exposing host-allocated PASID
> > > > into guest. However in the entire this proposal we actually virtualize
> > > > PASIDs, letting the guest manage its own PASID space in all
> > > > scenarios
> > >
> > > PASID cannot be virtualized without also using ENQCMD.
> > >
> > > A mdev that is using PASID without ENQCMD is non-migratable and this
> > > needs to be make visiable in the uAPI.
> >
> > No. without ENQCMD the PASID must be programmed to a mdev MMIO
> > register. This operation is mediated then mdev driver can translate the
> > PASID from virtual to real.
>
> That is probably unworkable with real devices, but if you do this you
> need to explicitly expose the vPASID to the mdev API somehow, and still
> the device needs to declare if it supports this, and devices that
> don't should still work in a non-migratable mode.
>
It's not necessary. For real devices we use the alias mapping for both guest
and host PASIDs as explained above. Then we can have the guest always register
its vPASID with the ioasid (just like map/unmap of GPA to HVA), and let host
drivers figure out whether that vPASID can be used as the real hwid. When it's
considered virtual and a real hwid is allocated by the host, the mapping is
saved under this ioasid and can be queried by device drivers if translation
is required.
From this angle, the previous IOASID_SET_HWID should possibly be renamed
to IOASID_SET_USER_HWID.
Thanks
Kevin
On Mon, May 03, 2021 at 01:15:18PM -0300, Jason Gunthorpe wrote:
> On Thu, Apr 29, 2021 at 01:04:05PM +1000, David Gibson wrote:
> > Again, I don't know enough about VDPA to make sense of that. Are we
> > essentially talking non-PCI virtual devices here? In which case you
> > could define the VDPA "bus" to always have one-device groups.
>
> It is much worse than that.
>
> What these non-PCI devices need is for the kernel driver to be part of
> the IOMMU group of the underlying PCI device but tell VFIO land that
> "groups don't matter"
I don't really see a semantic distinction between "always one-device
groups" and "groups don't matter". Really the only way you can afford
to not care about groups is if they're singletons.
> Today mdev tries to fake this by using singleton iommu groups, but it
> is really horrible and direcly hacks up the VFIO IOMMU code to
> understand these special cases. Intel was proposing more special
> hacking in the VFIO IOMMU code to extend this to PASID.
At this stage I don't really understand why that would end up so
horrible.
> When we get to a /dev/ioasid this is all nonsense. The kernel device
> driver is going to have to tell drivers/iommu exactly what kind of
> ioasid it can accept, be it a PASID inside a kernel owned group, a SW
> emulated 'mdev' ioasid, or whatever.
>
> In these cases the "group" idea has become a fiction that just creates
> a pain.
I don't see how the group is a fiction in this instance. You can
still have devices that can't be isolated, therefore you can have
non-singleton groups.
> "Just reorganize VDPA to do something insane with the driver
> core so we can create a dummy group to satisfy an unnecessary uAPI
> restriction" is not a very compelling argument.
>
> So if the nonsensical groups goes away for PASID/mdev, where does it
> leave the uAPI in other cases?
>
> > I don't think simplified-but-wrong is a good goal. The thing about
> > groups is that if they're there, you can't just "not care" about them,
> > they affect you whether you like it or not.
>
> You really can. If one thing claims the group then all the other group
> devices become locked out.
Aside: I'm primarily using "group" to mean the underlying hardware
unit, not the vfio construct on top of it; I'm not sure that's been
clear throughout.
So.. your model assumes that every device has a safe quiescent state
where it won't do any harm until poked, whether its group is
currently kernel owned, or owned by a userspace that doesn't know
anything about it.
At minimum this does mean that in order to use one device in the group
you must have permission to use *all* the devices in the group -
otherwise you may be able to operate a device you don't have
permission to by DMAing to its registers from a device you do have
permission to.
Whatever scripts are managing ownership of devices also need to know
about groups, because they need to put all the devices into that
quiescent state before the group can change ownership.
> The main point to understand is that groups are NOT an application
> restriction! It is a whole system restriction that the operator needs
> to understand and deal with. This is why things like dpdk don't care
> about the group at all - there is nothing they can do with the
> information.
>
> If the operator says to run dpdk on a specific device then the
> operator is the one that has to deal with all the other devices in the
> group getting locked out.
Ok, I think I see your point there.
> At best the application can make it more obvious that the operator is
> doing something dangerous, but the current kernel API doesn't seem to
> really support that either.
>
> Jason
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Tue, May 04, 2021 at 03:15:37PM -0300, Jason Gunthorpe wrote:
> On Tue, May 04, 2021 at 01:54:55PM +1000, David Gibson wrote:
> > On Mon, May 03, 2021 at 01:05:30PM -0300, Jason Gunthorpe wrote:
> > > On Thu, Apr 29, 2021 at 01:20:22PM +1000, David Gibson wrote:
> > > > > There is a certain appeal to having some
> > > > > 'PPC_TCE_CREATE_SPECIAL_IOASID' entry point that has a wack of extra
> > > > > information like windows that can be optionally called by the viommu
> > > > > driver and it remains well defined and described.
> > > >
> > > > Windows really aren't ppc specific. They're absolutely there on x86
> > > > and everything else as well - it's just that people are used to having
> > > > a window at 0..<something largish> that you can often get away with
> > > > treating it sloppily.
> > >
> > > My point is this detailed control seems to go on to more than just
> > > windows. As you say the vIOMMU is emulating specific HW that needs to
> > > have kernel interfaces to match it exactly.
> >
> > It's really not that bad. The case of emulating the PAPR vIOMMU on
> > something else is relatively easy, because all updates to the IO page
> > tables go through hypercalls. So, as long as the backend IOMMU can
> > map all the IOVAs that the guest IOMMU can, then qemu's implementation
> > of those hypercalls just needs to put an equivalent mapping in the
> > backend, which it can do with a generic VFIO_DMA_MAP.
>
> So you also want the PAPR vIOMMU driver to run on, say, an ARM IOMMU?
Well, I don't want to preclude it in the API. I'm not sure about that
specific example, but in most cases it should be possible to run the
PAPR vIOMMU on an x86 IOMMU backend. Obviously only something you'd
want to do for testing and experimentation, but it could be quite
useful for that.
> > vIOMMUs with page tables in guest memory are harder, but only really
> > in the usual ways that a vIOMMU of that type is harder (needs cache
> > mode or whatever). At whatever point you need to shadow from the
> > guest IO page tables to the host backend, you can again do that with
> > generic maps, as long as the backend supports the necessary IOVAs, and
> > has an IO page size that's equal to or a submultiple of the vIOMMU
> > page size.
>
> But this definitely all becomes HW specific.
>
> For instance I want to have an ARM vIOMMU driver it needs to do some
>
> ret = ioctl(ioasid_fd, CREATE_NESTED_IOASID, [page table format is ARMvXXX])
> if (ret == -EOPNOTSUPP)
> ret = ioctl(ioasid_fd, CREATE_NORMAL_IOASID, ..)
> // and do completely different and more expensive emulation
>
> I can get a little bit of generality, but at the end of the day the
> IOMMU must create a specific HW layout of the nested page table, if it
> can't, it can't.
Erm.. I don't really know how your IOASID interface works here. I'm
thinking about the VFIO interface where maps and unmaps are via
explicit ioctl()s, which provides an obvious point to do translation
between page table formats.
But.. even if you're exposing page tables to userspace.. with hardware
that has explicit support for nesting you can probably expose the hw
tables directly, which is great for the cases it works for. But
surely for older IOMMUs which don't do nesting you must have some way
of shadowing guest IO page tables to host IO page tables to translate
GPA to HPA at least? If you're doing that, I don't see that
converting the page table format is really any harder.
> > > I'm remarking that trying to unify every HW IOMMU implementation that
> > > ever has/will exist into a generic API complete enough to allow the
> > > vIOMMU to be created is likely to result in an API too complicated to
> > > understand..
> >
> > Maybe not every one, but I think we can get a pretty wide range with a
> > reasonable interface.
>
> It sounds like a reasonable guideline is if the feature is actually
> general to all IOMMUs and can be used by qemu as part of a vIOMMU
> emulation when compatible vIOMMU HW is not available.
>
> Having 'requested window' support that isn't actually implemented in
> every IOMMU is going to mean the PAPR vIOMMU emulation won't work,
> defeating the whole point of making things general?
The trick is that you don't necessarily need dynamic window support in
the backend to emulate it in the vIOMMU. If your backend has fixed
windows, then you emulate request window as:
if (requested window is within backend windows)
no-op;
else
return ERROR;
It might not be a theoretically complete emulation of the vIOMMU, but
it can support in-practice usage. In particular it works pretty well
if your backend has a nice big IOVA range (like x86 IOMMUs) but your
guest platform typically uses relatively small IOVA windows. PAPR on
x86 is exactly that... well.. possibly not the 64-bit window, but
because of old PAPR platforms that didn't support that, we can choose
not to advertise that and guests will cope.
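A small sketch of that check (the types and the way backend windows are
described below are assumptions for illustration, not an existing interface):

/* illustrative only: emulate a PAPR "create window" request on a backend
 * with fixed IOVA windows */
struct iova_window { uint64_t start, end; };    /* [start, end) */

static int emulate_create_window(const struct iova_window *backend,
                                 unsigned int nr_backend,
                                 uint64_t req_start, uint64_t req_end)
{
        unsigned int i;

        for (i = 0; i < nr_backend; i++)
                if (req_start >= backend[i].start && req_end <= backend[i].end)
                        return 0;       /* fits inside a backend window: no-op */

        return -EINVAL;                 /* cannot be satisfied */
}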
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Wed, May 05, 2021 at 01:39:02PM -0300, Jason Gunthorpe wrote:
> On Wed, May 05, 2021 at 02:28:53PM +1000, Alexey Kardashevskiy wrote:
>
> > This is a good feature in general when let's say there is a linux supported
> > device which has a proprietary device firmware update tool which only exists
> > as an x86 binary and your hardware is not x86 - running qemu + vfio in full
> > emulation would provide a way to run the tool to update a physical device.
>
> That specific use case doesn't really need a vIOMMU though, does it?
Possibly not, but the mechanics needed to do vIOMMU on different host
IOMMU aren't really different from what you need for a no-vIOMMU
guest. With a vIOMMU you need to map guest IOVA space into the host
IOVA space. With no no-vIOMMU you need to map guest physical
addresses into the host IOVA space. In either case the GPA/gIOVA to
userspace and userspace to HPA mappings are basically arbitrary.
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
> From: David Gibson <[email protected]>
> Sent: Thursday, May 13, 2021 2:01 PM
> >
> > But this definitely all becomes HW specific.
> >
> > For instance I want to have an ARM vIOMMU driver it needs to do some
> >
> > ret = ioctl(ioasid_fd, CREATE_NESTED_IOASID, [page table format is
> ARMvXXX])
> > if (ret == -EOPNOTSUPP)
> > ret = ioctl(ioasid_fd, CREATE_NORMAL_IOASID, ..)
> > // and do completely different and more expensive emulation
> >
> > I can get a little bit of generality, but at the end of the day the
> > IOMMU must create a specific HW layout of the nested page table, if it
> > can't, it can't.
>
> Erm.. I don't really know how your IOASID interface works here. I'm
> thinking about the VFIO interface where maps and unmaps are via
> explicit ioctl()s, which provides an obvious point to do translation
> between page table formats.
>
We are working on a draft uAPI proposal based on the discussions in this
thread and expect to send it out next week.
Thanks
Kevin
On Thu, May 13, 2021 at 04:01:20PM +1000, David Gibson wrote:
> But.. even if you're exposing page tables to userspace.. with hardware
> that has explicit support for nesting you can probably expose the hw
> tables directly which is great for the cases that works for. But
> surely for older IOMMUs which don't do nesting you must have some way
> of shadowing guest IO page tables to host IO page tables to translate
> GPA to HPA at least?
I expect this would be in qemu and would be part of the expensive
emulation I suggested: converting the guest's page table structure
into a sequence of map/unmaps to a non-nestable IOASID.
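Roughly, as a sketch (the helpers below are placeholders for illustration,
not a worked-out interface):

/* illustrative only: shadow one guest IO-PTE into a non-nested IOASID */
static void shadow_one_mapping(int ioasid_fd, int ioasid_id,
                               uint64_t iova, uint64_t gpa, uint64_t len)
{
        /* translate GPA to the userspace virtual address backing guest RAM,
         * then ask the kernel to map iova -> that memory */
        void *hva = gpa_to_hva(gpa);                          /* placeholder */

        ioasid_map_dma(ioasid_fd, ioasid_id, iova, hva, len); /* placeholder */
}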
> If you're doing that, I don't see that converting page table format
> is really any harder
It isn't, but it is a completely different, custom flow compared to the
normal HW-accelerated nesting.
> It might not be a theoretically complete emulation of the vIOMMU, but
> it can support in-practice usage. In particular it works pretty well
> if your backend has a nice big IOVA range (like x86 IOMMUS) but your
> guest platform typically uses relatively small IOVA windows. PAPR on
> x86 is exactly that... well.. possibly not the 64-bit window, but
> because of old PAPR platforms that didn't support that, we can choose
> not to advertise that and guests will cope.
So maybe this multi-window thing is a generic API somehow. You'll have
to check what Kevin comes up with to ensure it fits in.
Jason
On Thu, May 13, 2021 at 04:07:07PM +1000, David Gibson wrote:
> On Wed, May 05, 2021 at 01:39:02PM -0300, Jason Gunthorpe wrote:
> > On Wed, May 05, 2021 at 02:28:53PM +1000, Alexey Kardashevskiy wrote:
> >
> > > This is a good feature in general when let's say there is a linux supported
> > > device which has a proprietary device firmware update tool which only exists
> > > as an x86 binary and your hardware is not x86 - running qemu + vfio in full
> > > emulation would provide a way to run the tool to update a physical device.
> >
> > That specific use case doesn't really need a vIOMMU though, does it?
>
> Possibly not, but the mechanics needed to do vIOMMU on different host
> IOMMU aren't really different from what you need for a no-vIOMMU
> guest.
For very simple vIOMMUs this might be true, but these new features of
nesting, PASID, migration, etc. all make the vIOMMU complicated and
emulating it completely a lot harder.
Stuffing a vfio-pci device into a guest and creating a physical map using a
single IOASID is comparatively trivial.
Jason
On Thu, May 13, 2021 at 03:48:19PM +1000, David Gibson wrote:
> On Mon, May 03, 2021 at 01:15:18PM -0300, Jason Gunthorpe wrote:
> > On Thu, Apr 29, 2021 at 01:04:05PM +1000, David Gibson wrote:
> > > Again, I don't know enough about VDPA to make sense of that. Are we
> > > essentially talking non-PCI virtual devices here? In which case you
> > > could define the VDPA "bus" to always have one-device groups.
> >
> > It is much worse than that.
> >
> > What these non-PCI devices need is for the kernel driver to be part of
> > the IOMMU group of the underlying PCI device but tell VFIO land that
> > "groups don't matter"
>
> I don't really see a semantic distinction between "always one-device
> groups" and "groups don't matter". Really the only way you can afford
> to not care about groups is if they're singletons.
The kernel driver under the mdev may not be in an "always one-device"
group.
It is a kernel driver so the only thing we know and care about is that
all devices in the HW group are bound to kernel drivers.
The vfio device that spawns from this kernel driver is really a
"groups don't matter" vfio device because at the IOMMU layer it should
be riding on the physical group of the kernel driver. At the VFIO
layer we no longer care about the group abstraction because the system
guarantees isolation in some other way.
The issue is a software one of tightly coupling IOMMU HW groups to
VFIO's API and then introducing an entire class of VFIO mdev devices
that no longer care about IOMMU HW groups at all.
Currently mdev tries to fake this by creating singleton groups, but
it is very ugly and very tightly coupled to the specific expectations of
the few existing mdev drivers. Trying to add PASID made it a lot worse.
> Aside: I'm primarily using "group" to mean the underlying hardware
> unit, not the vfio construct on top of it, I'm not sure that's been
> clear throughout.
Sure, that is obviously fixed, but I'm not interested in that.
I'm interested in having a VFIO API that makes sense for vfio-pci
which has a tight coupling to the HW notion of a IOMMU and also vfio
mdev's that have no concept of a HW IOMMU group.
> So.. your model assumes that every device has a safe quiescent state
> where it won't do any harm until poked, whether its group is
> currently kernel owned, or owned by a userspace that doesn't know
> anything about it.
This is today's model, yes. When you run dpdk on a multi-group device
vfio already ensures that all the device groups remain parked and
inaccessible.
> At minimum this does mean that in order to use one device in the group
> you must have permission to use *all* the devices in the group -
> otherwise you may be able to operate a device you don't have
> permission to by DMAing to its registers from a device you do have
> permission to.
If the administrator configures the system with different security
labels for different VFIO devices then yes, removing groups makes this
trickier, as all devices in the group should have the same label.
Jason
On Thu, May 13, 2021 at 10:50:30AM -0300, Jason Gunthorpe wrote:
> On Thu, May 13, 2021 at 04:07:07PM +1000, David Gibson wrote:
> > On Wed, May 05, 2021 at 01:39:02PM -0300, Jason Gunthorpe wrote:
> > > On Wed, May 05, 2021 at 02:28:53PM +1000, Alexey Kardashevskiy wrote:
> > >
> > > > This is a good feature in general when let's say there is a linux supported
> > > > device which has a proprietary device firmware update tool which only exists
> > > > as an x86 binary and your hardware is not x86 - running qemu + vfio in full
> > > > emulation would provide a way to run the tool to update a physical device.
> > >
> > > That specific use case doesn't really need a vIOMMU though, does it?
> >
> > Possibly not, but the mechanics needed to do vIOMMU on different host
> > IOMMU aren't really different from what you need for a no-vIOMMU
> > guest.
>
> For very simple vIOMMUs this might be true, but this new features of nesting
> PASID, migration, etc, etc all make the vIOMMU complicated and
> emuluating it completely alot harder.
Well, sure, emulating a complex vIOMMU is complex. But "very simple
vIOMMUs" covers the vast majority of currently deployed hardware, and
several are already emulated by qemu.
> Stuffing a vfio-pci into a guest and creating a physical map using a
> single IOASID is comparably trivial.
Note that for PAPR (POWER guest) systems this is not an option: the
PAPR platform *always* has a vIOMMU.
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Thu, May 13, 2021 at 10:59:38AM -0300, Jason Gunthorpe wrote:
> On Thu, May 13, 2021 at 03:48:19PM +1000, David Gibson wrote:
> > On Mon, May 03, 2021 at 01:15:18PM -0300, Jason Gunthorpe wrote:
> > > On Thu, Apr 29, 2021 at 01:04:05PM +1000, David Gibson wrote:
> > > > Again, I don't know enough about VDPA to make sense of that. Are we
> > > > essentially talking non-PCI virtual devices here? In which case you
> > > > could define the VDPA "bus" to always have one-device groups.
> > >
> > > It is much worse than that.
> > >
> > > What these non-PCI devices need is for the kernel driver to be part of
> > > the IOMMU group of the underlying PCI device but tell VFIO land that
> > > "groups don't matter"
> >
> > I don't really see a semantic distinction between "always one-device
> > groups" and "groups don't matter". Really the only way you can afford
> > to not care about groups is if they're singletons.
>
> The kernel driver under the mdev may not be in an "always one-device"
> group.
I don't really understand what you mean by that.
> It is a kernel driver so the only thing we know and care about is that
> all devices in the HW group are bound to kernel drivers.
>
> The vfio device that spawns from this kernel driver is really a
> "groups don't matter" vfio device because at the IOMMU layer it should
> be riding on the physical group of the kernel driver. At the VFIO
> layer we no longer care about the group abstraction because the system
> guarentees isolation in some other way.
Uh.. I don't really know how mdevs are isolated from each other. I
thought it was because the physical device providing the mdevs
effectively had an internal IOMMU (or at least DMA permissioning) to
isolate the mdevs, even though the physical device may not be fully
isolated.
In that case the virtual mdev is effectively in a singleton group,
which is different from the group of its parent device.
If the physical device had a bug which meant the mdevs *weren't*
properly isolated from each other, then those mdevs would share a
group, and you *would* care about it. Depending on how the isolation
failed the mdevs might or might not also share a group with the parent
physical device.
> The issue is a software one of tightly coupling IOMMU HW groups to
> VFIO's API and then introducing an entire class of VFIO mdev devices
> that no longer care about IOMMU HW groups at all.
The don't necessarily care about the IOMMU groups of the parent
physical hardware, but they have their own IOMMU groups as virtual
hardware devices.
> Currently mdev tries to trick this by creating singleton groups, but
> it is very ugly and very tightly coupled to a specific expectation of
> the few existing mdev drivers. Trying to add PASID made it alot worse.
>
> > Aside: I'm primarily using "group" to mean the underlying hardware
> > unit, not the vfio construct on top of it, I'm not sure that's been
> > clear throughout.
>
> Sure, that is obviously fixed, but I'm not interested in that.
>
> I'm interested in having a VFIO API that makes sense for vfio-pci
> which has a tight coupling to the HW notion of a IOMMU and also vfio
> mdev's that have no concept of a HW IOMMU group.
>
> > So.. your model assumes that every device has a safe quiescent state
> > where it won't do any harm until poked, whether its group is
> > currently kernel owned, or owned by a userspace that doesn't know
> > anything about it.
>
> This is today's model, yes. When you run dpdk on a multi-group device
> vfio already ensures that all the device groups remained parked and
> inaccessible.
I'm not really following what you're saying there.
If you have a multi-device group, and dpdk is using one device in it,
VFIO *does not* (and cannot) ensure that other devices in the group
are parked and inaccessible. It ensures that they're parked at the
moment the group moves from kernel to userspace ownership, but it
can't prevent dpdk from accessing and unparking those devices via peer
to peer DMA.
> > At minimum this does mean that in order to use one device in the group
> > you must have permission to use *all* the devices in the group -
> > otherwise you may be able to operate a device you don't have
> > permission to by DMAing to its registers from a device you do have
> > permission to.
>
> If the administator configures the system with different security
> labels for different VFIO devices then yes removing groups makes this
> more tricky as all devices in the group should have the same label.
That seems a bigger problem than "more tricky". How would you propose
addressing this with your device-first model?
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Mon, May 24, 2021 at 05:52:58PM +1000, David Gibson wrote:
> > > I don't really see a semantic distinction between "always one-device
> > > groups" and "groups don't matter". Really the only way you can afford
> > > to not care about groups is if they're singletons.
> >
> > The kernel driver under the mdev may not be in an "always one-device"
> > group.
>
> I don't really understand what you mean by that.
I mean the group of the mdev's actual DMA device may have multiple
things in it.
> > It is a kernel driver so the only thing we know and care about is that
> > all devices in the HW group are bound to kernel drivers.
> >
> > The vfio device that spawns from this kernel driver is really a
> > "groups don't matter" vfio device because at the IOMMU layer it should
> > be riding on the physical group of the kernel driver. At the VFIO
> > layer we no longer care about the group abstraction because the system
> > guarentees isolation in some other way.
>
> Uh.. I don't really know how mdevs are isolated from each other. I
> thought it was because the physical device providing the mdevs
> effectively had an internal IOMMU (or at least DMA permissioning) to
> isolate the mdevs, even though the physical device may not be fully
> isolated.
>
> In that case the virtual mdev is effectively in a singleton group,
> which is different from the group of its parent device.
That is one way to view it, but it means creating a whole group
infrastructure and abusing the IOMMU stack just to create this
nonsense fiction. We also abuse the VFIO container stuff to hackily
create several different types of IOMMU uAPIs for the mdev - all of
which are unrelated to drivers/iommu.
Basically, there is no drivers/iommu thing involved, so there is no real
iommu group; for mdev it is all a big hacky lie.
> If the physical device had a bug which meant the mdevs *weren't*
> properly isolated from each other, then those mdevs would share a
> group, and you *would* care about it. Depending on how the isolation
> failed the mdevs might or might not also share a group with the parent
> physical device.
That isn't a real scenario: mdevs that can't be isolated just
wouldn't be useful enough to exist.
> > This is today's model, yes. When you run dpdk on a multi-group device
> > vfio already ensures that all the device groups remained parked and
> > inaccessible.
>
> I'm not really following what you're saying there.
>
> If you have a multi-device group, and dpdk is using one device in it,
> VFIO *does not* (and cannot) ensure that other devices in the group
> are parked and inaccessible.
I mean in the sense that no other user space can open those devices
and no kernel driver can later be attached to them.
> It ensures that they're parked at the moment the group moves from
> kernel to userspace ownership, but it can't prevent dpdk from
> accessing and unparking those devices via peer to peer DMA.
Right, and adding all this group stuff did nothing to alert the poor
admin that is running DPDK to this risk.
> > If the administator configures the system with different security
> > labels for different VFIO devices then yes removing groups makes this
> > more tricky as all devices in the group should have the same label.
>
> That seems a bigger problem than "more tricky". How would you propose
> addressing this with your device-first model?
You put the same security labels you'd put on the group onto the devices
that constitute the group. It is only more tricky in the sense that the
script doing this has to do more than just identify the group to label;
it also has to identify the device members of the group and label
their char nodes.
Jason
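To make that labelling step concrete, here is a small userspace sketch: walk
the group's member list in sysfs and apply the same owner to each member's
char node. The /dev/vfio/devices/<name> node path is purely hypothetical (no
per-device char node existed when this thread was written); only the
/sys/kernel/iommu_groups/<N>/devices layout is real.

#include <dirent.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>

/* Apply one uid/gid to every device in IOMMU group 'group'.  The
 * /dev/vfio/devices/<name> path is a hypothetical placeholder for
 * whatever per-device char node a device-centric uAPI would expose. */
static int label_group_devices(int group, uid_t uid, gid_t gid)
{
	char path[128];
	struct dirent *d;
	DIR *dir;

	snprintf(path, sizeof(path),
		 "/sys/kernel/iommu_groups/%d/devices", group);
	dir = opendir(path);
	if (!dir)
		return -1;

	while ((d = readdir(dir))) {
		char node[256];

		if (d->d_name[0] == '.')
			continue;
		snprintf(node, sizeof(node),
			 "/dev/vfio/devices/%s", d->d_name);
		if (chown(node, uid, gid))
			perror(node);
	}
	closedir(dir);
	return 0;
}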
On Wed, May 26, 2021 at 12:56:30AM +0530, Kirti Wankhede wrote:
> 2. iommu backed mdev devices for SRIOV where mdev device is created per
> VF (mdev device == VF device) then that mdev device has same iommu
> protection scope as VF associated to it.
This doesn't require, and certainly shouldn't create, a fake group.
Only the VF's real IOMMU group should be used to model an iommu domain
linked to a VF. Injecting fake groups that are proxies for real groups
only opens the possibility of security problems like David is
concerned with.
Max's series approaches this properly by fully linking the struct
pci_device of the VF throughout the entire VFIO scheme, including the
group and container, while still allowing override of various VFIO
operations.
Jason
On 5/26/2021 1:22 AM, Jason Gunthorpe wrote:
> On Wed, May 26, 2021 at 12:56:30AM +0530, Kirti Wankhede wrote:
>
>> 2. iommu backed mdev devices for SRIOV where mdev device is created per
>> VF (mdev device == VF device) then that mdev device has same iommu
>> protection scope as VF associated to it.
>
> This doesn't require, and certainly shouldn't create, a fake group.
>
> Only the VF's real IOMMU group should be used to model an iommu domain
> linked to a VF. Injecting fake groups that are proxies for real groups
> only opens the possibility of security problems like David is
> concerned with.
>
I think this security issue should be addressed by letting the mdev device
inherit its parent's iommu_group, i.e. the VF's iommu_group here.
Kirti
> Max's series approaches this properly by fully linking the struct
> pci_device of the VF throughout the entire VFIO scheme, including the
> group and container, while still allowing override of various VFIO
> operations.
>
> Jason
>
On Wed, 26 May 2021 00:56:30 +0530
Kirti Wankhede <[email protected]> wrote:
> On 5/25/2021 5:07 AM, Jason Gunthorpe wrote:
> > On Mon, May 24, 2021 at 05:52:58PM +1000, David Gibson wrote:
> >
> >>>> I don't really see a semantic distinction between "always one-device
> >>>> groups" and "groups don't matter". Really the only way you can afford
> >>>> to not care about groups is if they're singletons.
> >>>
> >>> The kernel driver under the mdev may not be in an "always one-device"
> >>> group.
> >>
> >> I don't really understand what you mean by that.
> >
> > I mean the group of the mdev's actual DMA device may have multiple
> > things in it.
> >
> >>> It is a kernel driver so the only thing we know and care about is that
> >>> all devices in the HW group are bound to kernel drivers.
> >>>
> >>> The vfio device that spawns from this kernel driver is really a
> >>> "groups don't matter" vfio device because at the IOMMU layer it should
> >>> be riding on the physical group of the kernel driver. At the VFIO
> >>> layer we no longer care about the group abstraction because the system
> >>> guarentees isolation in some other way.
> >>
> >> Uh.. I don't really know how mdevs are isolated from each other. I
> >> thought it was because the physical device providing the mdevs
> >> effectively had an internal IOMMU (or at least DMA permissioning) to
> >> isolate the mdevs, even though the physical device may not be fully
> >> isolated.
> >>
> >> In that case the virtual mdev is effectively in a singleton group,
> >> which is different from the group of its parent device.
> >
>
> That's correct.
>
> > That is one way to view it, but it means creating a whole group
> > infrastructure and abusing the IOMMU stack just to create this
> > nonsense fiction.
>
> I really didn't get how this abuse the IOMMU stack.
> mdev can be used in 3 different ways:
> 1. non-iommu backed mdev devices where mdev vendor driver takes care to
> DMA map (iommu_map) and isolation is through device hardware internal
> MMU. Here vfio_iommu_type1 module provides a way to validate and pin
> pages required by mdev device for DMA mapping. Then IOMMU mapping is
> done by mdev vendor driver which is owner driver of physical device.
>
> 2. iommu backed mdev devices for SRIOV where mdev device is created per
> VF (mdev device == VF device) then that mdev device has same iommu
> protection scope as VF associated to it. Here mdev device is virtual
> device which uses features of mdev and represents underlying VF device,
> same as vfio-pci but with additional mdev features.
What features would those be? There are no mdev specific parts of the
vfio uAPI.
The mdev device is a virtual device, but why is it virtual in this case?
Aren't we effectively assigning the VF itself (mdev device == VF device)
with a bunch of extra support code to fill in the gaps of the VF
implementing the complete device model in hardware?
We're effectively creating this virtual device, creating a fake IOMMU
group, and trying to create this association of this virtual device to
the real VF in order to shoehorn it into the mdev model. What do we
get from that model other than lifecycle management (ie. type selection)
and re-use of a bunch of code from the driver supporting the 1) model
above?
This specific model seems better served by a device specific peer
driver to vfio-pci (ie. a "vfio-pci variant"). You effectively already
have the code for this driver, it's just in the format of an mdev
driver rather than a vfio "bus driver". The work Jason references
relative to Max aims to make these kinds of drivers easier to implement
through re-use of vfio-pci code.
There are certainly other solutions we could come up with for selecting
a specific device type for a vfio-pci variant driver to implement other
than pretending this model actually belongs in mdev, right? Thanks,
Alex
On 5/25/2021 5:07 AM, Jason Gunthorpe wrote:
> On Mon, May 24, 2021 at 05:52:58PM +1000, David Gibson wrote:
>
>>>> I don't really see a semantic distinction between "always one-device
>>>> groups" and "groups don't matter". Really the only way you can afford
>>>> to not care about groups is if they're singletons.
>>>
>>> The kernel driver under the mdev may not be in an "always one-device"
>>> group.
>>
>> I don't really understand what you mean by that.
>
> I mean the group of the mdev's actual DMA device may have multiple
> things in it.
>
>>> It is a kernel driver so the only thing we know and care about is that
>>> all devices in the HW group are bound to kernel drivers.
>>>
>>> The vfio device that spawns from this kernel driver is really a
>>> "groups don't matter" vfio device because at the IOMMU layer it should
>>> be riding on the physical group of the kernel driver. At the VFIO
>>> layer we no longer care about the group abstraction because the system
>>> guarentees isolation in some other way.
>>
>> Uh.. I don't really know how mdevs are isolated from each other. I
>> thought it was because the physical device providing the mdevs
>> effectively had an internal IOMMU (or at least DMA permissioning) to
>> isolate the mdevs, even though the physical device may not be fully
>> isolated.
>>
>> In that case the virtual mdev is effectively in a singleton group,
>> which is different from the group of its parent device.
>
That's correct.
> That is one way to view it, but it means creating a whole group
> infrastructure and abusing the IOMMU stack just to create this
> nonsense fiction.
I really didn't get how this abuses the IOMMU stack.
mdev can be used in 3 different ways:
1. non-iommu backed mdev devices where mdev vendor driver takes care to
DMA map (iommu_map) and isolation is through device hardware internal
MMU. Here vfio_iommu_type1 module provides a way to validate and pin
pages required by mdev device for DMA mapping. Then IOMMU mapping is
done by mdev vendor driver which is owner driver of physical device.
2. iommu backed mdev devices for SRIOV where mdev device is created per
VF (mdev device == VF device) then that mdev device has same iommu
protection scope as VF associated to it. Here mdev device is virtual
device which uses features of mdev and represents underlying VF device,
same as vfio-pci but with additional mdev features.
3. iommu backed mdev devices for PASID with aux feature. I would not
comment on this, there has been a long discussion on this.
I don't think this is abusing the IOMMU stack, at least for 1 and 2 above.
Thanks,
Kirti
> We also abuse the VFIO container stuff to hackily
> create several different types pf IOMMU uAPIs for the mdev - all of
> which are unrelated to drivers/iommu.
>
> Basically, there is no drivers/iommu thing involved, thus is no really
> iommu group, for mdev it is all a big hacky lie.
>
>> If the physical device had a bug which meant the mdevs *weren't*
>> properly isolated from each other, then those mdevs would share a
>> group, and you *would* care about it. Depending on how the isolation
>> failed the mdevs might or might not also share a group with the parent
>> physical device.
>
> That isn't a real scenario.. mdevs that can't be isolated just
> wouldn't be useful to exist
>
>>> This is today's model, yes. When you run dpdk on a multi-group device
>>> vfio already ensures that all the device groups remained parked and
>>> inaccessible.
>>
>> I'm not really following what you're saying there.
>>
>> If you have a multi-device group, and dpdk is using one device in it,
>> VFIO *does not* (and cannot) ensure that other devices in the group
>> are parked and inaccessible.
>
> I mean in the sense that no other user space can open those devices
> and no kernel driver can later be attached to them.
>
>> It ensures that they're parked at the moment the group moves from
>> kernel to userspace ownership, but it can't prevent dpdk from
>> accessing and unparking those devices via peer to peer DMA.
>
> Right, and adding all this group stuff did nothing to alert the poor
> admin that is running DPDK to this risk.
>
>>> If the administator configures the system with different security
>>> labels for different VFIO devices then yes removing groups makes this
>>> more tricky as all devices in the group should have the same label.
>>
>> That seems a bigger problem than "more tricky". How would you propose
>> addressing this with your device-first model?
>
> You put the same security labels you'd put on the group to the devices
> that consitute the group. It is only more tricky in the sense that the
> script that would have to do this will need to do more than ID the
> group to label but also ID the device members of the group and label
> their char nodes.
>
> Jason
>
On 5/26/2021 4:22 AM, Alex Williamson wrote:
> On Wed, 26 May 2021 00:56:30 +0530
> Kirti Wankhede <[email protected]> wrote:
>
>> On 5/25/2021 5:07 AM, Jason Gunthorpe wrote:
>>> On Mon, May 24, 2021 at 05:52:58PM +1000, David Gibson wrote:
>>>
>>>>>> I don't really see a semantic distinction between "always one-device
>>>>>> groups" and "groups don't matter". Really the only way you can afford
>>>>>> to not care about groups is if they're singletons.
>>>>>
>>>>> The kernel driver under the mdev may not be in an "always one-device"
>>>>> group.
>>>>
>>>> I don't really understand what you mean by that.
>>>
>>> I mean the group of the mdev's actual DMA device may have multiple
>>> things in it.
>>>
>>>>> It is a kernel driver so the only thing we know and care about is that
>>>>> all devices in the HW group are bound to kernel drivers.
>>>>>
>>>>> The vfio device that spawns from this kernel driver is really a
>>>>> "groups don't matter" vfio device because at the IOMMU layer it should
>>>>> be riding on the physical group of the kernel driver. At the VFIO
>>>>> layer we no longer care about the group abstraction because the system
>>>>> guarentees isolation in some other way.
>>>>
>>>> Uh.. I don't really know how mdevs are isolated from each other. I
>>>> thought it was because the physical device providing the mdevs
>>>> effectively had an internal IOMMU (or at least DMA permissioning) to
>>>> isolate the mdevs, even though the physical device may not be fully
>>>> isolated.
>>>>
>>>> In that case the virtual mdev is effectively in a singleton group,
>>>> which is different from the group of its parent device.
>>>
>>
>> That's correct.
>>
>>> That is one way to view it, but it means creating a whole group
>>> infrastructure and abusing the IOMMU stack just to create this
>>> nonsense fiction.
>>
>> I really didn't get how this abuse the IOMMU stack.
>> mdev can be used in 3 different ways:
>> 1. non-iommu backed mdev devices where mdev vendor driver takes care to
>> DMA map (iommu_map) and isolation is through device hardware internal
>> MMU. Here vfio_iommu_type1 module provides a way to validate and pin
>> pages required by mdev device for DMA mapping. Then IOMMU mapping is
>> done by mdev vendor driver which is owner driver of physical device.
>>
>> 2. iommu backed mdev devices for SRIOV where mdev device is created per
>> VF (mdev device == VF device) then that mdev device has same iommu
>> protection scope as VF associated to it. Here mdev device is virtual
>> device which uses features of mdev and represents underlying VF device,
>> same as vfio-pci but with additional mdev features.
>
> What features would those be? There are no mdev specific parts of the
> vfio uAPI.
>
> The mdev device is a virtual device, by why it it virtual in this case?
> Aren't we effectively assigning the VF itself (mdev device == VF device)
> with a bunch of extra support code to fill in the gaps of the VF
> implementing the complete device model in hardware?
>
> We're effectively creating this virtual device, creating a fake IOMMU
> group, and trying to create this association of this virtual device to
> the real VF in order to shoehorn it into the mdev model. What do we
> get from that model other than lifecycle management (ie. type selection)
> and re-use of a bunch of code from the driver supporting the 1) model
> above?
>
Yes, the lifecycle management which is in mdev is not in the vfio-pci variant.
> This specific model seems better served by a device specific peer
> driver to vfio-pci (ie. a "vfio-pci variant"). You effectively already
> have the code for this driver, it's just in the format of an mdev
> driver rather than a vfio "bus driver". The work Jason references
> relative to Max aims to make these kinds of drivers easier to implement
> through re-use of vfio-pci code.
>
> There are certainly other solutions we could come up with for selecting
> a specific device type for a vfio-pci variant driver to implement other
> than pretending this model actually belongs in mdev, right? Thanks,
>
Sure, and I would like to see the type selection mechanism implemented in
the vfio-pci variant.
Thanks,
Kirti
On Wed, 26 May 2021 23:40:02 +0530
Kirti Wankhede <[email protected]> wrote:
> On 5/26/2021 4:22 AM, Alex Williamson wrote:
> > On Wed, 26 May 2021 00:56:30 +0530
> > Kirti Wankhede <[email protected]> wrote:
> >
> >> On 5/25/2021 5:07 AM, Jason Gunthorpe wrote:
> >>> On Mon, May 24, 2021 at 05:52:58PM +1000, David Gibson wrote:
> >>>
> >>>>>> I don't really see a semantic distinction between "always one-device
> >>>>>> groups" and "groups don't matter". Really the only way you can afford
> >>>>>> to not care about groups is if they're singletons.
> >>>>>
> >>>>> The kernel driver under the mdev may not be in an "always one-device"
> >>>>> group.
> >>>>
> >>>> I don't really understand what you mean by that.
> >>>
> >>> I mean the group of the mdev's actual DMA device may have multiple
> >>> things in it.
> >>>
> >>>>> It is a kernel driver so the only thing we know and care about is that
> >>>>> all devices in the HW group are bound to kernel drivers.
> >>>>>
> >>>>> The vfio device that spawns from this kernel driver is really a
> >>>>> "groups don't matter" vfio device because at the IOMMU layer it should
> >>>>> be riding on the physical group of the kernel driver. At the VFIO
> >>>>> layer we no longer care about the group abstraction because the system
> >>>>> guarentees isolation in some other way.
> >>>>
> >>>> Uh.. I don't really know how mdevs are isolated from each other. I
> >>>> thought it was because the physical device providing the mdevs
> >>>> effectively had an internal IOMMU (or at least DMA permissioning) to
> >>>> isolate the mdevs, even though the physical device may not be fully
> >>>> isolated.
> >>>>
> >>>> In that case the virtual mdev is effectively in a singleton group,
> >>>> which is different from the group of its parent device.
> >>>
> >>
> >> That's correct.
> >>
> >>> That is one way to view it, but it means creating a whole group
> >>> infrastructure and abusing the IOMMU stack just to create this
> >>> nonsense fiction.
> >>
> >> I really didn't get how this abuse the IOMMU stack.
> >> mdev can be used in 3 different ways:
> >> 1. non-iommu backed mdev devices where mdev vendor driver takes care to
> >> DMA map (iommu_map) and isolation is through device hardware internal
> >> MMU. Here vfio_iommu_type1 module provides a way to validate and pin
> >> pages required by mdev device for DMA mapping. Then IOMMU mapping is
> >> done by mdev vendor driver which is owner driver of physical device.
> >>
> >> 2. iommu backed mdev devices for SRIOV where mdev device is created per
> >> VF (mdev device == VF device) then that mdev device has same iommu
> >> protection scope as VF associated to it. Here mdev device is virtual
> >> device which uses features of mdev and represents underlying VF device,
> >> same as vfio-pci but with additional mdev features.
> >
> > What features would those be? There are no mdev specific parts of the
> > vfio uAPI.
> >
> > The mdev device is a virtual device, by why it it virtual in this case?
> > Aren't we effectively assigning the VF itself (mdev device == VF device)
> > with a bunch of extra support code to fill in the gaps of the VF
> > implementing the complete device model in hardware?
> >
> > We're effectively creating this virtual device, creating a fake IOMMU
> > group, and trying to create this association of this virtual device to
> > the real VF in order to shoehorn it into the mdev model. What do we
> > get from that model other than lifecycle management (ie. type selection)
> > and re-use of a bunch of code from the driver supporting the 1) model
> > above?
> >
>
> Yes, the lifecycle management which is in mdev is not in vfio-pci variant.
>
> > This specific model seems better served by a device specific peer
> > driver to vfio-pci (ie. a "vfio-pci variant"). You effectively already
> > have the code for this driver, it's just in the format of an mdev
> > driver rather than a vfio "bus driver". The work Jason references
> > relative to Max aims to make these kinds of drivers easier to implement
> > through re-use of vfio-pci code.
> >
> > There are certainly other solutions we could come up with for selecting
> > a specific device type for a vfio-pci variant driver to implement other
> > than pretending this model actually belongs in mdev, right? Thanks,
> >
>
> Sure and would like to see type selection mechanism to be implemented in
> vfio-pci variant.
A driver-provided sysfs attribute would obviously fill the short
term gap; long term maybe this would be standardized via netlink. It
seems a bit analogous to setting the MAC address for a VF on an SR-IOV
NIC or VF namespace configuration for an SR-IOV NVMe device. Thanks,
Alex
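As a rough sketch of what such a driver-provided sysfs attribute could look
like: the my_type name, the my_variant_dev structure and the registration
details are all hypothetical, not an existing vfio interface.

#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/sysfs.h>

struct my_variant_dev {
	struct device *dev;
	unsigned int type;	/* which device "type" to expose */
};

static ssize_t my_type_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct my_variant_dev *vdev = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%u\n", vdev->type);
}

static ssize_t my_type_store(struct device *dev,
			     struct device_attribute *attr,
			     const char *buf, size_t count)
{
	struct my_variant_dev *vdev = dev_get_drvdata(dev);
	unsigned int type;

	if (kstrtouint(buf, 0, &type))
		return -EINVAL;
	vdev->type = type;	/* picked up when the device is opened */
	return count;
}
static DEVICE_ATTR_RW(my_type);
/* registration via the driver's attribute group is omitted here */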
On Wed, May 26, 2021 at 12:59:05PM -0600, Alex Williamson wrote:
> A driver provided sysfs attribute would obviously fill the short
> term gap, long term maybe this would be standardized via netlink. It
> seems a bit analogous to setting the MAC address for a VF on an SR-IOV
> NIC or VF namespace configuration for an SR-IOV NVMe device. Thanks,
We have been doing a lot of work in netlink/devlink to program the VF
settings before starting the VF driver.
I've long thought it would be good to standardize a VF lifecycle in
Linux, e.g. VFs start their life in some 'unconfigured' state and drivers
don't bind, then userspace can use the PF to perform configuration, and
finally the driver can start on a configured VF.
We are already doing this model for the mlx5 'sub function' interfaces,
which have a lot of analogs to both VFs and Intel's SIOV ADIs. It seems
to be working.
Jason
On Tue, May 25, 2021 at 04:52:57PM -0300, Jason Gunthorpe wrote:
> On Wed, May 26, 2021 at 12:56:30AM +0530, Kirti Wankhede wrote:
>
> > 2. iommu backed mdev devices for SRIOV where mdev device is created per
> > VF (mdev device == VF device) then that mdev device has same iommu
> > protection scope as VF associated to it.
>
> This doesn't require, and certainly shouldn't create, a fake group.
It's only fake if you start with a narrow view of what a group is. A
group is a set of devices (in the kernel sense of "device", not
necessarily the hardware sense) which can't be isolated from each
other. The mdev device is a kernel device, and if working as intended
it can be isolated from everything else, and is therefore in an
absolute bona fide group of its own.
> Only the VF's real IOMMU group should be used to model an iommu domain
> linked to a VF. Injecting fake groups that are proxies for real groups
> only opens the possibility of security problems like David is
> concerned with.
It's not a proxy for a real group, it's a group of its own. If you
discover that (due to a hardware bug, for example) the mdev is *not*
properly isolated from its parent PCI device, then both the mdev
virtual device *and* the physical PCI device are in the same group.
Groups including devices of different types and on different buses
were considered from the start, and are precedented, if rare.
> Max's series approaches this properly by fully linking the struct
> pci_device of the VF throughout the entire VFIO scheme, including the
> group and container, while still allowing override of various VFIO
> operations.
>
> Jason
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Mon, May 24, 2021 at 08:37:44PM -0300, Jason Gunthorpe wrote:
> On Mon, May 24, 2021 at 05:52:58PM +1000, David Gibson wrote:
>
> > > > I don't really see a semantic distinction between "always one-device
> > > > groups" and "groups don't matter". Really the only way you can afford
> > > > to not care about groups is if they're singletons.
> > >
> > > The kernel driver under the mdev may not be in an "always one-device"
> > > group.
> >
> > I don't really understand what you mean by that.
>
> I mean the group of the mdev's actual DMA device may have multiple
> things in it.
>
> > > It is a kernel driver so the only thing we know and care about is that
> > > all devices in the HW group are bound to kernel drivers.
> > >
> > > The vfio device that spawns from this kernel driver is really a
> > > "groups don't matter" vfio device because at the IOMMU layer it should
> > > be riding on the physical group of the kernel driver. At the VFIO
> > > layer we no longer care about the group abstraction because the system
> > > guarentees isolation in some other way.
> >
> > Uh.. I don't really know how mdevs are isolated from each other. I
> > thought it was because the physical device providing the mdevs
> > effectively had an internal IOMMU (or at least DMA permissioning) to
> > isolate the mdevs, even though the physical device may not be fully
> > isolated.
> >
> > In that case the virtual mdev is effectively in a singleton group,
> > which is different from the group of its parent device.
>
> That is one way to view it, but it means creating a whole group
> infrastructure and abusing the IOMMU stack just to create this
> nonsense fiction.
It's a nonsense fiction until it's not, at which point it will bite
you in the arse.
> We also abuse the VFIO container stuff to hackily
> create several different types pf IOMMU uAPIs for the mdev - all of
> which are unrelated to drivers/iommu.
>
> Basically, there is no drivers/iommu thing involved, thus is no really
> iommu group, for mdev it is all a big hacky lie.
Well, "iommu" group might not be the best name, but hardware isolation
is still a real concern here, even if it's not entirely related to the
IOMMU.
> > If the physical device had a bug which meant the mdevs *weren't*
> > properly isolated from each other, then those mdevs would share a
> > group, and you *would* care about it. Depending on how the isolation
> > failed the mdevs might or might not also share a group with the parent
> > physical device.
>
> That isn't a real scenario.. mdevs that can't be isolated just
> wouldn't be useful to exist
Really? So what do you do when you discover some mdevs you thought
were isolated actually aren't due to a hardware bug? Drop support
from the driver entirely? In which case what do you say to the people
who understandably complain "but... we had all the mdevs in one guest
anyway, we don't care if they're not isolated"?
> > > This is today's model, yes. When you run dpdk on a multi-group device
> > > vfio already ensures that all the device groups remained parked and
> > > inaccessible.
> >
> > I'm not really following what you're saying there.
> >
> > If you have a multi-device group, and dpdk is using one device in it,
> > VFIO *does not* (and cannot) ensure that other devices in the group
> > are parked and inaccessible.
>
> I mean in the sense that no other user space can open those devices
> and no kernel driver can later be attached to them.
Ok.
> > It ensures that they're parked at the moment the group moves from
> > kernel to userspace ownership, but it can't prevent dpdk from
> > accessing and unparking those devices via peer to peer DMA.
>
> Right, and adding all this group stuff did nothing to alert the poor
> admin that is running DPDK to this risk.
Didn't it? Seems to me that in order to give the group to
DPDK, the admin had to find and unbind all the things in it... and is
therefore aware that they're giving everything in it to DPDK.
> > > If the administator configures the system with different security
> > > labels for different VFIO devices then yes removing groups makes this
> > > more tricky as all devices in the group should have the same label.
> >
> > That seems a bigger problem than "more tricky". How would you propose
> > addressing this with your device-first model?
>
> You put the same security labels you'd put on the group to the devices
> that consitute the group. It is only more tricky in the sense that the
> script that would have to do this will need to do more than ID the
> group to label but also ID the device members of the group and label
> their char nodes.
Well, I guess, if you take the view that root is allowed to break the
kernel. I tend to prefer that although root can obviously break the
kernel if they intend to, we should make it hard to do by accident -
which in this case would mean the kernel *enforcing* that the devices
in the group have the same security labels, which I can't really see
how to do without an exposed group.
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Wed, May 26, 2021 at 02:48:03AM +0530, Kirti Wankhede wrote:
>
>
> On 5/26/2021 1:22 AM, Jason Gunthorpe wrote:
> > On Wed, May 26, 2021 at 12:56:30AM +0530, Kirti Wankhede wrote:
> >
> > > 2. iommu backed mdev devices for SRIOV where mdev device is created per
> > > VF (mdev device == VF device) then that mdev device has same iommu
> > > protection scope as VF associated to it.
> >
> > This doesn't require, and certainly shouldn't create, a fake group.
> >
> > Only the VF's real IOMMU group should be used to model an iommu domain
> > linked to a VF. Injecting fake groups that are proxies for real groups
> > only opens the possibility of security problems like David is
> > concerned with.
> >
>
> I think this security issue should be addressed by letting mdev device
> inherit its parent's iommu_group, i.e. VF's iommu_group here.
No, that doesn't work. AIUI part of the whole point of mdevs is to
allow chunks of a single PCI function to be handed out to different
places, because they're isolated from each other not by the system
IOMMU, but by a combination of MMU hardware in the device itself (e.g. in a
GPU card) and software in the mdev driver. If mdevs inherited the
group of their parent device they wouldn't count as isolated from each
other, which they should.
>
> Kirti
>
> > Max's series approaches this properly by fully linking the struct
> > pci_device of the VF throughout the entire VFIO scheme, including the
> > group and container, while still allowing override of various VFIO
> > operations.
> >
> > Jason
> >
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On 5/27/2021 10:30 AM, David Gibson wrote:
> On Wed, May 26, 2021 at 02:48:03AM +0530, Kirti Wankhede wrote:
>>
>>
>> On 5/26/2021 1:22 AM, Jason Gunthorpe wrote:
>>> On Wed, May 26, 2021 at 12:56:30AM +0530, Kirti Wankhede wrote:
>>>
>>>> 2. iommu backed mdev devices for SRIOV where mdev device is created per
>>>> VF (mdev device == VF device) then that mdev device has same iommu
>>>> protection scope as VF associated to it.
>>>
>>> This doesn't require, and certainly shouldn't create, a fake group.
>>>
>>> Only the VF's real IOMMU group should be used to model an iommu domain
>>> linked to a VF. Injecting fake groups that are proxies for real groups
>>> only opens the possibility of security problems like David is
>>> concerned with.
>>>
>>
>> I think this security issue should be addressed by letting mdev device
>> inherit its parent's iommu_group, i.e. VF's iommu_group here.
>
> No, that doesn't work. AIUI part of the whole point of mdevs is to
> allow chunks of a single PCI function to be handed out to different
> places, because they're isolated from each other not by the system
> IOMMU, but by a combination of MMU hardware in the hardware (e.g. in a
> GPU card) and software in the mdev driver.
That's correct for non-iommu backed mdev devices.
> If mdevs inherited the
> group of their parent device they wouldn't count as isolated from each
> other, which they should.
>
For iommu backed mdev devices for SRIOV, there can only be a single mdev
device for its parent; here the parent device is a VF, and there can't be
multiple mdev devices associated with that VF. In this case the mdev can
inherit the group of the parent device.
Kirti
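A rough sketch of what that inheritance could look like using the generic
IOMMU group API; the helper below is hypothetical and is not how mdev core
actually handled iommu-backed mdevs at the time.

#include <linux/device.h>
#include <linux/errno.h>
#include <linux/iommu.h>

/* Place the mdev's struct device into the iommu_group of its parent VF
 * instead of giving it a singleton group of its own. */
static int my_mdev_inherit_group(struct device *mdev_dev,
				 struct device *vf_dev)
{
	struct iommu_group *group = iommu_group_get(vf_dev);
	int ret;

	if (!group)
		return -ENODEV;

	ret = iommu_group_add_device(group, mdev_dev);
	iommu_group_put(group);		/* drop the lookup reference */
	return ret;
}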
On Thu, May 27, 2021 at 02:58:30PM +1000, David Gibson wrote:
> On Tue, May 25, 2021 at 04:52:57PM -0300, Jason Gunthorpe wrote:
> > On Wed, May 26, 2021 at 12:56:30AM +0530, Kirti Wankhede wrote:
> >
> > > 2. iommu backed mdev devices for SRIOV where mdev device is created per
> > > VF (mdev device == VF device) then that mdev device has same iommu
> > > protection scope as VF associated to it.
> >
> > This doesn't require, and certainly shouldn't create, a fake group.
>
> It's only fake if you start with a narrow view of what a group is.
A group is connected to drivers/iommu. A group object without *any*
relation to drivers/iommu is just a complete fiction, IMHO.
> > Only the VF's real IOMMU group should be used to model an iommu domain
> > linked to a VF. Injecting fake groups that are proxies for real groups
> > only opens the possibility of security problems like David is
> > concerned with.
>
> It's not a proxy for a real group, it's a group of its own. If you
> discover that (due to a hardware bug, for example) the mdev is *not*
What Kirti is talking about here is the case where a mdev is wrapped
around a VF and the DMA isolation stems directly from the SRIOV VF's
inherent DMA isolation, not anything the mdev wrapper did.
The group providing the isolation is the VF's group.
The group mdev implicitly creates is just a fake proxy that comes
along with mdev API. It doesn't do anything and it doesn't mean
anything.
> properly isolated from its parent PCI device, then both the mdev
> virtual device *and* the physical PCI device are in the same group.
> Groups including devices of different types and on different buses
> were considered from the start, and are precedented, if rare.
This is far too theoretical for me. A security broken mdev is
functionally useless.
We don't need to support it, and we don't need complicated software to
model it.
Jason
On Thu, May 27, 2021 at 02:53:42PM +1000, David Gibson wrote:
> > > If the physical device had a bug which meant the mdevs *weren't*
> > > properly isolated from each other, then those mdevs would share a
> > > group, and you *would* care about it. Depending on how the isolation
> > > failed the mdevs might or might not also share a group with the parent
> > > physical device.
> >
> > That isn't a real scenario.. mdevs that can't be isolated just
> > wouldn't be useful to exist
>
> Really? So what do you do when you discover some mdevs you thought
> were isolated actually aren't due to a hardware bug? Drop support
> from the driver entirely? In which case what do you say to the people
> who understandably complain "but... we had all the mdevs in one guest
> anyway, we don't care if they're not isolated"?
I've never said to eliminate groups entirely.
What I'm saying is that all the cases we have for mdev today do not
require groups, but are forced to create a fake group anyhow just to
satisfy the odd VFIO requirement to have a group FD.
If some future mdev needs groups then sure, add the appropriate group
stuff.
But that doesn't affect the decision to have a VFIO group FD, or not.
> > > It ensures that they're parked at the moment the group moves from
> > > kernel to userspace ownership, but it can't prevent dpdk from
> > > accessing and unparking those devices via peer to peer DMA.
> >
> > Right, and adding all this group stuff did nothing to alert the poor
> > admin that is running DPDK to this risk.
>
> Didn't it? Seems to me the admin that in order to give the group to
> DPDK, the admin had to find and unbind all the things in it... so is
> therefore aware that they're giving everything in it to DPDK.
Again, I've never said the *group* should be removed. I'm only
concerned about the *group FD*.
When the admin found and unbound those devices they didn't use the
*group FD* in any way.
> > You put the same security labels you'd put on the group to the devices
> > that consitute the group. It is only more tricky in the sense that the
> > script that would have to do this will need to do more than ID the
> > group to label but also ID the device members of the group and label
> > their char nodes.
>
> Well, I guess, if you take the view that root is allowed to break the
> kernel. I tend to prefer that although root can obviously break the
> kernel if they intend do, we should make it hard to do by accident -
> which in this case would mean the kernel *enforcing* that the devices
> in the group have the same security labels, which I can't really see
> how to do without an exposed group.
How is this "break the kernel"? It has nothing to do with the
kernel. Security labels are a user space concern.
Jason
On Thu, May 27, 2021 at 11:55:00PM +0530, Kirti Wankhede wrote:
>
>
> On 5/27/2021 10:30 AM, David Gibson wrote:
> > On Wed, May 26, 2021 at 02:48:03AM +0530, Kirti Wankhede wrote:
> > >
> > >
> > > On 5/26/2021 1:22 AM, Jason Gunthorpe wrote:
> > > > On Wed, May 26, 2021 at 12:56:30AM +0530, Kirti Wankhede wrote:
> > > >
> > > > > 2. iommu backed mdev devices for SRIOV where mdev device is created per
> > > > > VF (mdev device == VF device) then that mdev device has same iommu
> > > > > protection scope as VF associated to it.
> > > >
> > > > This doesn't require, and certainly shouldn't create, a fake group.
> > > >
> > > > Only the VF's real IOMMU group should be used to model an iommu domain
> > > > linked to a VF. Injecting fake groups that are proxies for real groups
> > > > only opens the possibility of security problems like David is
> > > > concerned with.
> > > >
> > >
> > > I think this security issue should be addressed by letting mdev device
> > > inherit its parent's iommu_group, i.e. VF's iommu_group here.
> >
> > No, that doesn't work. AIUI part of the whole point of mdevs is to
> > allow chunks of a single PCI function to be handed out to different
> > places, because they're isolated from each other not by the system
> > IOMMU, but by a combination of MMU hardware in the hardware (e.g. in a
> > GPU card) and software in the mdev driver.
>
> That's correct for non-iommu backed mdev devices.
>
> > If mdevs inherited the
> > group of their parent device they wouldn't count as isolated from each
> > other, which they should.
> >
>
> For iommu backed mdev devices for SRIOV, where there can be single mdev
> device for its parent, here parent device is VF, there can't be multiple
> mdev devices associated with that VF. In this case mdev can inherit the
> group of parent device.
Ah, yes, if there's just one mdev for the PCI function, and the
function doesn't have an internal memory protection unit then this
makes sense.
Which means we *do* have at least two meaningfully different group
configurations for mdev:
* mdev is in a singleton group independent of the parent PCI device
* mdev shares a group with its parent PCI device
Which means even in the case of mdevs, the group structure is *not* a
meaningless fiction.
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Thu, May 27, 2021 at 04:06:20PM -0300, Jason Gunthorpe wrote:
> On Thu, May 27, 2021 at 02:53:42PM +1000, David Gibson wrote:
>
> > > > If the physical device had a bug which meant the mdevs *weren't*
> > > > properly isolated from each other, then those mdevs would share a
> > > > group, and you *would* care about it. Depending on how the isolation
> > > > failed the mdevs might or might not also share a group with the parent
> > > > physical device.
> > >
> > > That isn't a real scenario.. mdevs that can't be isolated just
> > > wouldn't be useful to exist
> >
> > Really? So what do you do when you discover some mdevs you thought
> > were isolated actually aren't due to a hardware bug? Drop support
> > from the driver entirely? In which case what do you say to the people
> > who understandably complain "but... we had all the mdevs in one guest
> > anyway, we don't care if they're not isolated"?
>
> I've never said to eliminate groups entirely.
>
> What I'm saying is that all the cases we have for mdev today do not
> require groups, but are forced to create a fake group anyhow just to
> satisfy the odd VFIO requirement to have a group FD.
>
> If some future mdev needs groups then sure, add the appropriate group
> stuff.
>
> But that doesn't effect the decision to have a VFIO group FD, or not.
>
> > > > It ensures that they're parked at the moment the group moves from
> > > > kernel to userspace ownership, but it can't prevent dpdk from
> > > > accessing and unparking those devices via peer to peer DMA.
> > >
> > > Right, and adding all this group stuff did nothing to alert the poor
> > > admin that is running DPDK to this risk.
> >
> > Didn't it? Seems to me the admin that in order to give the group to
> > DPDK, the admin had to find and unbind all the things in it... so is
> > therefore aware that they're giving everything in it to DPDK.
>
> Again, I've never said the *group* should be removed. I'm only
> concerned about the *group FD*
Ok, that wasn't really clear to me.
I still wouldn't say the group for mdevs is a fiction though.. rather
that the group device used for mdevs (in the no-internal-IOMMU case) is
just plain wrong.
> When the admin found and unbound they didn't use the *group FD* in any
> way.
No, they are likely to have changed permissions on the group device
node as part of the process, though.
> > > You put the same security labels you'd put on the group to the devices
> > > that consitute the group. It is only more tricky in the sense that the
> > > script that would have to do this will need to do more than ID the
> > > group to label but also ID the device members of the group and label
> > > their char nodes.
> >
> > Well, I guess, if you take the view that root is allowed to break the
> > kernel. I tend to prefer that although root can obviously break the
> > kernel if they intend do, we should make it hard to do by accident -
> > which in this case would mean the kernel *enforcing* that the devices
> > in the group have the same security labels, which I can't really see
> > how to do without an exposed group.
>
> How is this "break the kernel"? It has nothing to do with the
> kernel. Security labels are a user space concern.
*thinks*... yeah, ok, that was much too strong an assertion. What I
was thinking of is the fact that this means that guarantees you'd
normally expect the kernel to enforce can be obviated by bad
configuration: chown-ing a device to root doesn't actually protect it
if there's another device in the same group exposed to other users.
But I guess you could say the same about, say, an unauthenticated nbd
export of a root-owned block device, so I guess that's not something
the kernel can reasonably enforce.
Ok.. you might be finally convincing me, somewhat.
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Thu, May 27, 2021 at 03:48:47PM -0300, Jason Gunthorpe wrote:
> On Thu, May 27, 2021 at 02:58:30PM +1000, David Gibson wrote:
> > On Tue, May 25, 2021 at 04:52:57PM -0300, Jason Gunthorpe wrote:
> > > On Wed, May 26, 2021 at 12:56:30AM +0530, Kirti Wankhede wrote:
> > >
> > > > 2. iommu backed mdev devices for SRIOV where mdev device is created per
> > > > VF (mdev device == VF device) then that mdev device has same iommu
> > > > protection scope as VF associated to it.
> > >
> > > This doesn't require, and certainly shouldn't create, a fake group.
> >
> > It's only fake if you start with a narrow view of what a group is.
>
> A group is connected to drivers/iommu. A group object without *any*
> relation to drivers/iommu is just a complete fiction, IMHO.
That might be where we differ. As I've said, by "group" I primarily
mean the fundamental hardware unit of isolation. *Usually* that's
determined by the capabilities of an IOMMU, but in some cases it might
not be. In either case, the boundaries still matter.
> > > Only the VF's real IOMMU group should be used to model an iommu domain
> > > linked to a VF. Injecting fake groups that are proxies for real groups
> > > only opens the possibility of security problems like David is
> > > concerned with.
> >
> > It's not a proxy for a real group, it's a group of its own. If you
> > discover that (due to a hardware bug, for example) the mdev is *not*
>
> What Kirti is talking about here is the case where a mdev is wrapped
> around a VF and the DMA isolation stems directly from the SRIOV VF's
> inherent DMA isolation, not anything the mdev wrapper did.
>
> The group providing the isolation is the VF's group.
Yes, in that case the mdev absolutely should be in the VF's group -
having its own group is not just messy but incorrect.
> The group mdev implicitly creates is just a fake proxy that comes
> along with mdev API. It doesn't do anything and it doesn't mean
> anything.
But.. the case of multiple mdevs managed by a single PCI device with
an internal IOMMU also exists, and then the mdev groups are *not*
proxies but true groups independent of the parent device. Which means
that the group structure of mdevs can vary, which is an argument *for*
keeping it, not against.
> > properly isolated from its parent PCI device, then both the mdev
> > virtual device *and* the physical PCI device are in the same group.
> > Groups including devices of different types and on different buses
> > were considered from the start, and are precedented, if rare.
>
> This is far too theoretical for me. A security broken mdev is
> functionally useless.
Is it, though? Again, I'm talking about the case of multiple mdevs
with a single parent device (because that's the only case I was aware
of until recently). Isolation comes from a device-internal
IOMMU... that turns out to be broken. But if your security domain
happens to include all the mdevs on the device anyway, then you don't
care.
Are you really going to say people can't use their fancy hardware in
this mode because it has a security flaw that's not relevant to their
usecase?
And then.. there's Kirti's case. In that case the mdev should belong
to its parent PCI device's group since that's what's providing
isolation. But in that case the parent device can be in a
multi-device group for any of the usual reasons (PCIe-to-PCI bridge,
PCIe switch with broken ACS, multifunction device with crosstalk).
Which means the mdev also shares a group with those other devices. So
again, the group structure matters and is not a fiction.
> We don't need to support it, and we don't need complicated software to
> model it.
>
> Jason
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Tue, Jun 01, 2021 at 02:03:33PM +1000, David Gibson wrote:
> On Thu, May 27, 2021 at 03:48:47PM -0300, Jason Gunthorpe wrote:
> > On Thu, May 27, 2021 at 02:58:30PM +1000, David Gibson wrote:
> > > On Tue, May 25, 2021 at 04:52:57PM -0300, Jason Gunthorpe wrote:
> > > > On Wed, May 26, 2021 at 12:56:30AM +0530, Kirti Wankhede wrote:
> > > >
> > > > > 2. iommu backed mdev devices for SRIOV where mdev device is created per
> > > > > VF (mdev device == VF device) then that mdev device has same iommu
> > > > > protection scope as VF associated to it.
> > > >
> > > > This doesn't require, and certainly shouldn't create, a fake group.
> > >
> > > It's only fake if you start with a narrow view of what a group is.
> >
> > A group is connected to drivers/iommu. A group object without *any*
> > relation to drivers/iommu is just a complete fiction, IMHO.
>
> That might be where we differ. As I've said, my group I'm primarily
> meaning the fundamental hardware unit of isolation. *Usually* that's
> determined by the capabilities of an IOMMU, but in some cases it might
> not be. In either case, the boundaries still matter.
As in my other email we absolutely need a group concept, it is just a
question of how the user API is designed around it.
> > The group mdev implicitly creates is just a fake proxy that comes
> > along with mdev API. It doesn't do anything and it doesn't mean
> > anything.
>
> But.. the case of multiple mdevs managed by a single PCI device with
> an internal IOMMU also exists, and then the mdev groups are *not*
> proxies but true groups independent of the parent device. Which means
> that the group structure of mdevs can vary, which is an argument *for*
> keeping it, not against.
If VFIO becomes more "vfio_device" centric then the vfio_device itself
has some properties. One of those can be "is it inside a drivers/iommu
group, or not?".
If the vfio_device is not using a drivers/iommu IOMMU interface then
it can just have no group at all - no reason to lie. This would mean
that the device has perfect isolation.
What I don't like is forcing certain things depending on how the
vfio_device was created - for instance forcing an IOMMU group, and
forcing an ugly "SW IOMMU" mode in the container, only because it is an
mdev_device.
These should all be properties of the vfio_device itself.
Again this is all about the group fd - and how to fit in with the
/dev/ioasid proposal from Kevin:
https://lore.kernel.org/kvm/MWHPR11MB1886422D4839B372C6AB245F8C239@MWHPR11MB1886.namprd11.prod.outlook.com/
Focusing on vfio_device and skipping the group fd smooths out some
rough edges.
Code wise we are not quite there, but I have mapped out eliminating
the group from the vfio_device centric API and a few other places it
has crept in.
The group can exist in the background to enforce security without
being a cornerstone of the API design.
Jason
On Tue, Jun 01, 2021 at 09:57:12AM -0300, Jason Gunthorpe wrote:
> On Tue, Jun 01, 2021 at 02:03:33PM +1000, David Gibson wrote:
> > On Thu, May 27, 2021 at 03:48:47PM -0300, Jason Gunthorpe wrote:
> > > On Thu, May 27, 2021 at 02:58:30PM +1000, David Gibson wrote:
> > > > On Tue, May 25, 2021 at 04:52:57PM -0300, Jason Gunthorpe wrote:
> > > > > On Wed, May 26, 2021 at 12:56:30AM +0530, Kirti Wankhede wrote:
> > > > >
> > > > > > 2. iommu backed mdev devices for SRIOV where mdev device is created per
> > > > > > VF (mdev device == VF device) then that mdev device has same iommu
> > > > > > protection scope as VF associated to it.
> > > > >
> > > > > This doesn't require, and certainly shouldn't create, a fake group.
> > > >
> > > > It's only fake if you start with a narrow view of what a group is.
> > >
> > > A group is connected to drivers/iommu. A group object without *any*
> > > relation to drivers/iommu is just a complete fiction, IMHO.
> >
> > That might be where we differ. As I've said, my group I'm primarily
> > meaning the fundamental hardware unit of isolation. *Usually* that's
> > determined by the capabilities of an IOMMU, but in some cases it might
> > not be. In either case, the boundaries still matter.
>
> As in my other email we absolutely need a group concept, it is just a
> question of how the user API is designed around it.
>
> > > The group mdev implicitly creates is just a fake proxy that comes
> > > along with mdev API. It doesn't do anything and it doesn't mean
> > > anything.
> >
> > But.. the case of multiple mdevs managed by a single PCI device with
> > an internal IOMMU also exists, and then the mdev groups are *not*
> > proxies but true groups independent of the parent device. Which means
> > that the group structure of mdevs can vary, which is an argument *for*
> > keeping it, not against.
>
> If VFIO becomes more "vfio_device" centric then the vfio_device itself
> has some properties. One of those can be "is it inside a drivers/iommu
> group, or not?".
>
> If the vfio_device is not using a drivers/iommu IOMMU interface then
> it can just have no group at all - no reason to lie. This would mean
> that the device has perfect isolation.
When you say "not using a drivers/iommu IOMMU interface" do you
basically mean the device doesn't do DMA? I can see some benefit to
that, but some drawbacks too. The *main* form of isolation (or lack
thereof) that groups are about is the IOMMU, but groups can also represent
other forms of isolation failure: e.g. a multifunction device, where
function 0 has some debug registers which affect other functions.
That's relevant whether or not any of those functions use DMA.
Now, we could represent those different sorts of isolation separately,
but at the time our thinking was that we should group together devices
that can't be safely isolated for *any* reason, since the practical
upshot is the same: you can't safely split those devices between
different owners.
> What I don't like is forcing certain things depending on how the
> vfio_device was created - for instance forcing a IOMMU group as part
> and forcing an ugly "SW IOMMU" mode in the container only as part of
> mdev_device.
I don't really see how this depends on how the device is created.
The current VFIO model is that every device always belongs to a group
- but that group might be a singleton. That seems less complicated to
me than having some devices with a group and some without.
> These should all be properties of the vfio_device itself.
>
> Again this is all about the group fd - and how to fit in with the
> /dev/ioasid proposal from Kevin:
>
> https://lore.kernel.org/kvm/MWHPR11MB1886422D4839B372C6AB245F8C239@MWHPR11MB1886.namprd11.prod.outlook.com/
>
> Focusing on vfio_device and skipping the group fd smooths out some
> rough edges.
>
> Code wise we are not quite there, but I have mapped out eliminating
> the group from the vfio_device centric API and a few other places it
> has crept in.
>
> The group can exist in the background to enforce security without
> being a cornerstone of the API design.
>
> Jason
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
On Tue, Jun 08, 2021 at 10:44:31AM +1000, David Gibson wrote:
> When you say "not using a drivers/iommu IOMMU interface" do you
> basically mean the device doesn't do DMA?
No, I mean the device doesn't use iommu_map() to manage the DMA
mappings.
vfio_iommu_type1 has a special code path that mdev triggers that
doesn't allocate an IOMMU domain and doesn't call iommu_map() or
anything related to that.
Instead an mdev driver calls vfio_pin_pages(), which "reads" a fake page
table and returns the CPU pages for the mdev to DMA map however
it likes.
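A minimal sketch of that flow, assuming the vendor driver keeps a
my_mdev_state and programs its on-device MMU through a hypothetical
my_hw_program_dma(); the vfio_pin_pages() call uses the signature from the
kernels this thread was written against.

#include <linux/iommu.h>
#include <linux/types.h>
#include <linux/vfio.h>

struct my_mdev_state {
	struct device *dev;	/* the mdev's struct device */
};

/* Hypothetical, device specific: program the on-device MMU / aperture */
static void my_hw_program_dma(struct my_mdev_state *s, dma_addr_t iova,
			      phys_addr_t pa)
{
}

/* Pin one guest page through vfio_iommu_type1 and map it on the device */
static int my_mdev_map_one(struct my_mdev_state *s, unsigned long user_pfn,
			   dma_addr_t iova)
{
	unsigned long phys_pfn;
	int ret;

	ret = vfio_pin_pages(s->dev, &user_pfn, 1,
			     IOMMU_READ | IOMMU_WRITE, &phys_pfn);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	my_hw_program_dma(s, iova, (phys_addr_t)phys_pfn << PAGE_SHIFT);
	return 0;
}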
> Now, we could represent those different sorts of isolation separately,
> but at the time our thinking was that we should group together devices
> that can't be safely isolated for *any* reason, since the practical
> upshot is the same: you can't safely split those devices between
> different owners.
It is fine, but the direction is going the other way: devices have
perfect isolation and rely on special interactions with the iommu to
get it.
> > What I don't like is forcing certain things depending on how the
> > vfio_device was created - for instance forcing a IOMMU group as part
> > and forcing an ugly "SW IOMMU" mode in the container only as part of
> > mdev_device.
>
> I don't really see how this is depending on how the device is
> created.
static int vfio_iommu_type1_attach_group(void *iommu_data,
					 struct iommu_group *iommu_group)
{
	...
	if (vfio_bus_is_mdev(bus)) {
What the iommu code does depends on how the device was created. This
is really ugly.
This is happening because the three objects in the model
(driver/group/domain) are not being linked together in a way that
reflects the modern world.
The group has no idea what the driver wants but is in charge of
creating the domain on behalf of the device.
And so people have created complicated hackery to pass
information from the driver to the group, through the device, so that
the group can create the right domain.
I want to see the driver simply create the right domain directly. It
is much simpler and scales to more domain complexity.
Jason
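For illustration, a minimal sketch of a driver allocating and attaching its
own unmanaged domain directly through the generic IOMMU API; the names are
illustrative only and the error handling is trimmed.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/iommu.h>

/* The driver itself decides what domain it needs, allocates it, attaches
 * the device, and owns the mappings - no group-level guesswork. */
static struct iommu_domain *my_driver_make_domain(struct device *dev,
						  phys_addr_t buf_pa,
						  size_t size)
{
	struct iommu_domain *domain;

	domain = iommu_domain_alloc(dev->bus);
	if (!domain)
		return ERR_PTR(-ENOMEM);

	if (iommu_attach_device(domain, dev)) {
		iommu_domain_free(domain);
		return ERR_PTR(-EBUSY);
	}

	/* The driver now programs its own IOVA layout directly */
	if (iommu_map(domain, 0, buf_pa, size, IOMMU_READ | IOMMU_WRITE)) {
		iommu_detach_device(domain, dev);
		iommu_domain_free(domain);
		return ERR_PTR(-ENOMEM);
	}

	return domain;
}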