This patch series converts the Intel IOMMU driver to use the generic SVA
helpers defined in iommu-sva-lib.c and io-pgfault.c. It covers SVA PASID
management and I/O page fault handling.
The series is divided into the following parts:
PATCH[1-3]:  Use iommu_sva_alloc/free_pasid() to manage the SVA PASID.
PATCH[4-6]:  Use the generic SVA I/O page fault handler iommu_queue_iopf().
PATCH[7-11]: Add ftrace and debugfs support for I/O page fault handling.
Please help review.
Best regards,
baolu
Lu Baolu (11):
iommu/vt-d: Add pasid private data helpers
iommu/vt-d: Use iommu_sva_alloc(free)_pasid() helpers
iommu/vt-d: Use common helper to lookup svm devices
iommu/vt-d: Refactor prq_event_thread()
iommu/vt-d: Allocate/register iopf queue for sva devices
iommu/vt-d: Report prq to io-pgfault framework
iommu/vt-d: Add prq_report trace event
iommu/vt-d: Add common code for dmar latency performance monitors
iommu/vt-d: Expose latency monitor data through debugfs
iommu/vt-d: Add cache invalidation latency sampling
iommu/vt-d: Add PRQ handling latency sampling
include/linux/intel-iommu.h | 33 +-
drivers/iommu/intel/perf.h | 73 ++++
include/trace/events/intel_iommu.h | 37 ++
drivers/iommu/intel/debugfs.c | 111 +++++
drivers/iommu/intel/dmar.c | 31 ++
drivers/iommu/intel/iommu.c | 73 +++-
drivers/iommu/intel/perf.c | 166 ++++++++
drivers/iommu/intel/svm.c | 627 ++++++++++++++---------------
drivers/iommu/intel/Kconfig | 5 +
drivers/iommu/intel/Makefile | 1 +
10 files changed, 819 insertions(+), 338 deletions(-)
create mode 100644 drivers/iommu/intel/perf.h
create mode 100644 drivers/iommu/intel/perf.c
--
2.25.1
We are about to use the iommu_sva_alloc/free_pasid() helpers in the iommu
core. That means the PASID life cycle will be managed by the iommu core.
Use a local xarray to save the per-PASID private data instead of attaching
it to the real PASID.
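For illustration, a minimal usage sketch of the new helpers (the caller
below is hypothetical; error handling is simplified):

static int example_attach(struct intel_svm *svm)
{
	int ret;

	/* xa_alloc() with XA_LIMIT(pasid, pasid) stores @svm at exactly svm->pasid. */
	ret = pasid_private_add(svm->pasid, svm);
	if (ret)
		return ret;

	/* Lookups that previously went through ioasid_find(NULL, pasid, NULL). */
	if (pasid_private_find(svm->pasid) != svm)
		return -ENODEV;

	pasid_private_remove(svm->pasid);
	return 0;
}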
Signed-off-by: Lu Baolu <[email protected]>
---
drivers/iommu/intel/svm.c | 62 ++++++++++++++++++++++++++-------------
1 file changed, 41 insertions(+), 21 deletions(-)
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 5165cea90421..82b0627ad7e7 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -17,6 +17,7 @@
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
+#include <linux/xarray.h>
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>
@@ -28,6 +29,23 @@ static void intel_svm_drain_prq(struct device *dev, u32 pasid);
#define PRQ_ORDER 0
+static DEFINE_XARRAY_ALLOC(pasid_private_array);
+static int pasid_private_add(ioasid_t pasid, void *priv)
+{
+ return xa_alloc(&pasid_private_array, &pasid, priv,
+ XA_LIMIT(pasid, pasid), GFP_ATOMIC);
+}
+
+static void pasid_private_remove(ioasid_t pasid)
+{
+ xa_erase(&pasid_private_array, pasid);
+}
+
+static void *pasid_private_find(ioasid_t pasid)
+{
+ return xa_load(&pasid_private_array, pasid);
+}
+
int intel_svm_enable_prq(struct intel_iommu *iommu)
{
struct page *pages;
@@ -224,7 +242,7 @@ static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
return -EINVAL;
- svm = ioasid_find(NULL, pasid, NULL);
+ svm = pasid_private_find(pasid);
if (IS_ERR(svm))
return PTR_ERR(svm);
@@ -334,7 +352,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
svm->gpasid = data->gpasid;
svm->flags |= SVM_FLAG_GUEST_PASID;
}
- ioasid_set_data(data->hpasid, svm);
+ pasid_private_add(data->hpasid, svm);
INIT_LIST_HEAD_RCU(&svm->devs);
mmput(svm->mm);
}
@@ -388,7 +406,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
list_add_rcu(&sdev->list, &svm->devs);
out:
if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
- ioasid_set_data(data->hpasid, NULL);
+ pasid_private_remove(data->hpasid);
kfree(svm);
}
@@ -431,7 +449,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
* the unbind, IOMMU driver will get notified
* and perform cleanup.
*/
- ioasid_set_data(pasid, NULL);
+ pasid_private_remove(pasid);
kfree(svm);
}
}
@@ -547,8 +565,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
svm = kzalloc(sizeof(*svm), GFP_KERNEL);
if (!svm) {
ret = -ENOMEM;
- kfree(sdev);
- goto out;
+ goto sdev_err;
}
if (pasid_max > intel_pasid_max_id)
@@ -556,13 +573,16 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
/* Do not use PASID 0, reserved for RID to PASID */
svm->pasid = ioasid_alloc(NULL, PASID_MIN,
- pasid_max - 1, svm);
+ pasid_max - 1, NULL);
if (svm->pasid == INVALID_IOASID) {
- kfree(svm);
- kfree(sdev);
ret = -ENOSPC;
- goto out;
+ goto svm_err;
}
+
+ ret = pasid_private_add(svm->pasid, svm);
+ if (ret)
+ goto pasid_err;
+
svm->notifier.ops = &intel_mmuops;
svm->mm = mm;
svm->flags = flags;
@@ -571,12 +591,8 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
ret = -ENOMEM;
if (mm) {
ret = mmu_notifier_register(&svm->notifier, mm);
- if (ret) {
- ioasid_put(svm->pasid);
- kfree(svm);
- kfree(sdev);
- goto out;
- }
+ if (ret)
+ goto priv_err;
}
spin_lock_irqsave(&iommu->lock, iflags);
@@ -590,8 +606,13 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
if (ret) {
if (mm)
mmu_notifier_unregister(&svm->notifier, mm);
+priv_err:
+ pasid_private_remove(svm->pasid);
+pasid_err:
ioasid_put(svm->pasid);
+svm_err:
kfree(svm);
+sdev_err:
kfree(sdev);
goto out;
}
@@ -614,10 +635,8 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
(cpu_feature_enabled(X86_FEATURE_LA57) ?
PASID_FLAG_FL5LP : 0));
spin_unlock_irqrestore(&iommu->lock, iflags);
- if (ret) {
- kfree(sdev);
- goto out;
- }
+ if (ret)
+ goto sdev_err;
}
list_add_rcu(&sdev->list, &svm->devs);
success:
@@ -670,6 +689,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
load_pasid(svm->mm, PASID_DISABLED);
}
list_del(&svm->list);
+ pasid_private_remove(svm->pasid);
/* We mandate that no page faults may be outstanding
* for the PASID when intel_svm_unbind_mm() is called.
* If that is not obeyed, subtle errors will happen.
@@ -924,7 +944,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
}
if (!svm || svm->pasid != req->pasid) {
rcu_read_lock();
- svm = ioasid_find(NULL, req->pasid, NULL);
+ svm = pasid_private_find(req->pasid);
/* It *can't* go away, because the driver is not permitted
* to unbind the mm while any page faults are outstanding.
* So we only need RCU to protect the internal idr code. */
--
2.25.1
Align the PASID alloc/free code with the generic helpers defined in the
iommu core. This also refactors the SVA binding code to improve
readability.
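As a rough sketch of the resulting flow (condensed from this patch; the
wrapper name is hypothetical and error handling is trimmed), the bind path
now allocates the PASID through the shared helper before doing the
Intel-specific setup:

static struct iommu_sva *example_bind(struct intel_iommu *iommu,
				      struct device *dev,
				      struct mm_struct *mm,
				      unsigned int flags)
{
	ioasid_t max_pasid = dev_is_pci(dev) ?
		pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id;
	struct iommu_sva *sva;
	int ret;

	/* mm->pasid is now owned by the iommu core, not by this driver. */
	ret = iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1);
	if (ret)
		return ERR_PTR(ret);

	sva = intel_svm_bind_mm(iommu, dev, mm, flags);
	if (IS_ERR_OR_NULL(sva))
		iommu_sva_free_pasid(mm);	/* roll back on failure */

	return sva;
}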
Signed-off-by: Lu Baolu <[email protected]>
---
include/linux/intel-iommu.h | 1 -
drivers/iommu/intel/iommu.c | 3 +
drivers/iommu/intel/svm.c | 278 +++++++++++++++---------------------
drivers/iommu/intel/Kconfig | 1 +
4 files changed, 120 insertions(+), 163 deletions(-)
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 03faf20a6817..4e8bb186daa7 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -791,7 +791,6 @@ struct intel_svm {
u32 pasid;
int gpasid; /* In case that guest PASID is different from host PASID */
struct list_head devs;
- struct list_head list;
};
#else
static inline void intel_svm_check(struct intel_iommu *iommu) {}
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index d760f587c229..fa6223bf1e7b 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -5411,6 +5411,9 @@ intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
if (!info)
return -EINVAL;
+ if (intel_iommu_enable_pasid(info->iommu, dev))
+ return -ENODEV;
+
if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
return -EINVAL;
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 82b0627ad7e7..da4310686ed3 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -23,9 +23,11 @@
#include <asm/fpu/api.h>
#include "pasid.h"
+#include "../iommu-sva-lib.h"
static irqreturn_t prq_event_thread(int irq, void *d);
static void intel_svm_drain_prq(struct device *dev, u32 pasid);
+#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
#define PRQ_ORDER 0
@@ -222,7 +224,6 @@ static const struct mmu_notifier_ops intel_mmuops = {
};
static DEFINE_MUTEX(pasid_mutex);
-static LIST_HEAD(global_svm_list);
#define for_each_svm_dev(sdev, svm, d) \
list_for_each_entry((sdev), &(svm)->devs, list) \
@@ -477,79 +478,80 @@ static void load_pasid(struct mm_struct *mm, u32 pasid)
mutex_unlock(&mm->context.lock);
}
-/* Caller must hold pasid_mutex, mm reference */
-static int
-intel_svm_bind_mm(struct device *dev, unsigned int flags,
- struct mm_struct *mm, struct intel_svm_dev **sd)
+static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm,
+ unsigned int flags)
{
- struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
- struct intel_svm *svm = NULL, *t;
- struct device_domain_info *info;
- struct intel_svm_dev *sdev;
- unsigned long iflags;
- int pasid_max;
- int ret;
+ ioasid_t max_pasid = dev_is_pci(dev) ?
+ pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id;
- if (!iommu || dmar_disabled)
- return -EINVAL;
+ return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1);
+}
- if (!intel_svm_capable(iommu))
- return -ENOTSUPP;
+static void intel_svm_free_pasid(struct mm_struct *mm)
+{
+ iommu_sva_free_pasid(mm);
+}
- if (dev_is_pci(dev)) {
- pasid_max = pci_max_pasids(to_pci_dev(dev));
- if (pasid_max < 0)
- return -EINVAL;
- } else
- pasid_max = 1 << 20;
+static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
+ struct device *dev,
+ struct mm_struct *mm,
+ unsigned int flags)
+{
+ struct device_domain_info *info = get_domain_info(dev);
+ unsigned long iflags, sflags;
+ struct intel_svm_dev *sdev;
+ struct intel_svm *svm;
+ int ret = 0;
- /* Bind supervisor PASID shuld have mm = NULL */
- if (flags & SVM_FLAG_SUPERVISOR_MODE) {
- if (!ecap_srs(iommu->ecap) || mm) {
- pr_err("Supervisor PASID with user provided mm.\n");
- return -EINVAL;
- }
- }
+ svm = pasid_private_find(mm->pasid);
+ if (!svm) {
+ svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+ if (!svm)
+ return ERR_PTR(-ENOMEM);
- list_for_each_entry(t, &global_svm_list, list) {
- if (t->mm != mm)
- continue;
+ svm->pasid = mm->pasid;
+ svm->mm = mm;
+ svm->flags = flags;
+ INIT_LIST_HEAD_RCU(&svm->devs);
- svm = t;
- if (svm->pasid >= pasid_max) {
- dev_warn(dev,
- "Limited PASID width. Cannot use existing PASID %d\n",
- svm->pasid);
- ret = -ENOSPC;
- goto out;
+ if (!(flags & SVM_FLAG_SUPERVISOR_MODE)) {
+ svm->notifier.ops = &intel_mmuops;
+ ret = mmu_notifier_register(&svm->notifier, mm);
+ if (ret) {
+ kfree(svm);
+ return ERR_PTR(ret);
+ }
}
- /* Find the matching device in svm list */
- for_each_svm_dev(sdev, svm, dev) {
- sdev->users++;
- goto success;
+ ret = pasid_private_add(svm->pasid, svm);
+ if (ret) {
+ if (svm->notifier.ops)
+ mmu_notifier_unregister(&svm->notifier, mm);
+ kfree(svm);
+ return ERR_PTR(ret);
}
+ }
- break;
+ /* Find the matching device in svm list */
+ for_each_svm_dev(sdev, svm, dev) {
+ sdev->users++;
+ goto success;
}
sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
if (!sdev) {
ret = -ENOMEM;
- goto out;
+ goto free_svm;
}
+
sdev->dev = dev;
sdev->iommu = iommu;
-
- ret = intel_iommu_enable_pasid(iommu, dev);
- if (ret) {
- kfree(sdev);
- goto out;
- }
-
- info = get_domain_info(dev);
sdev->did = FLPT_DEFAULT_DID;
sdev->sid = PCI_DEVID(info->bus, info->devfn);
+ sdev->users = 1;
+ sdev->pasid = svm->pasid;
+ sdev->sva.dev = dev;
+ init_rcu_head(&sdev->rcu);
if (info->ats_enabled) {
sdev->dev_iotlb = 1;
sdev->qdep = info->ats_qdep;
@@ -557,96 +559,37 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
sdev->qdep = 0;
}
- /* Finish the setup now we know we're keeping it */
- sdev->users = 1;
- init_rcu_head(&sdev->rcu);
-
- if (!svm) {
- svm = kzalloc(sizeof(*svm), GFP_KERNEL);
- if (!svm) {
- ret = -ENOMEM;
- goto sdev_err;
- }
-
- if (pasid_max > intel_pasid_max_id)
- pasid_max = intel_pasid_max_id;
-
- /* Do not use PASID 0, reserved for RID to PASID */
- svm->pasid = ioasid_alloc(NULL, PASID_MIN,
- pasid_max - 1, NULL);
- if (svm->pasid == INVALID_IOASID) {
- ret = -ENOSPC;
- goto svm_err;
- }
-
- ret = pasid_private_add(svm->pasid, svm);
- if (ret)
- goto pasid_err;
+ /* Setup the pasid table: */
+ sflags = (flags & SVM_FLAG_SUPERVISOR_MODE) ?
+ PASID_FLAG_SUPERVISOR_MODE : 0;
+ sflags |= cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
+ spin_lock_irqsave(&iommu->lock, iflags);
+ ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid,
+ FLPT_DEFAULT_DID, sflags);
+ spin_unlock_irqrestore(&iommu->lock, iflags);
- svm->notifier.ops = &intel_mmuops;
- svm->mm = mm;
- svm->flags = flags;
- INIT_LIST_HEAD_RCU(&svm->devs);
- INIT_LIST_HEAD(&svm->list);
- ret = -ENOMEM;
- if (mm) {
- ret = mmu_notifier_register(&svm->notifier, mm);
- if (ret)
- goto priv_err;
- }
+ if (ret)
+ goto free_sdev;
- spin_lock_irqsave(&iommu->lock, iflags);
- ret = intel_pasid_setup_first_level(iommu, dev,
- mm ? mm->pgd : init_mm.pgd,
- svm->pasid, FLPT_DEFAULT_DID,
- (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
- (cpu_feature_enabled(X86_FEATURE_LA57) ?
- PASID_FLAG_FL5LP : 0));
- spin_unlock_irqrestore(&iommu->lock, iflags);
- if (ret) {
- if (mm)
- mmu_notifier_unregister(&svm->notifier, mm);
-priv_err:
- pasid_private_remove(svm->pasid);
-pasid_err:
- ioasid_put(svm->pasid);
-svm_err:
- kfree(svm);
-sdev_err:
- kfree(sdev);
- goto out;
- }
+ /* The newly allocated pasid is loaded to the mm. */
+ if (!(flags & SVM_FLAG_SUPERVISOR_MODE) && list_empty(&svm->devs))
+ load_pasid(mm, svm->pasid);
- list_add_tail(&svm->list, &global_svm_list);
- if (mm) {
- /* The newly allocated pasid is loaded to the mm. */
- load_pasid(mm, svm->pasid);
- }
- } else {
- /*
- * Binding a new device with existing PASID, need to setup
- * the PASID entry.
- */
- spin_lock_irqsave(&iommu->lock, iflags);
- ret = intel_pasid_setup_first_level(iommu, dev,
- mm ? mm->pgd : init_mm.pgd,
- svm->pasid, FLPT_DEFAULT_DID,
- (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
- (cpu_feature_enabled(X86_FEATURE_LA57) ?
- PASID_FLAG_FL5LP : 0));
- spin_unlock_irqrestore(&iommu->lock, iflags);
- if (ret)
- goto sdev_err;
- }
list_add_rcu(&sdev->list, &svm->devs);
success:
- sdev->pasid = svm->pasid;
- sdev->sva.dev = dev;
- if (sd)
- *sd = sdev;
- ret = 0;
-out:
- return ret;
+ return &sdev->sva;
+
+free_sdev:
+ kfree(sdev);
+free_svm:
+ if (list_empty(&svm->devs)) {
+ if (svm->notifier.ops)
+ mmu_notifier_unregister(&svm->notifier, mm);
+ pasid_private_remove(mm->pasid);
+ kfree(svm);
+ }
+
+ return ERR_PTR(ret);
}
/* Caller must hold pasid_mutex */
@@ -655,6 +598,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
struct intel_svm_dev *sdev;
struct intel_iommu *iommu;
struct intel_svm *svm;
+ struct mm_struct *mm;
int ret = -EINVAL;
iommu = device_to_iommu(dev, NULL, NULL);
@@ -664,6 +608,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
if (ret)
goto out;
+ mm = svm->mm;
if (sdev) {
sdev->users--;
@@ -682,13 +627,12 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
kfree_rcu(sdev, rcu);
if (list_empty(&svm->devs)) {
- ioasid_put(svm->pasid);
- if (svm->mm) {
- mmu_notifier_unregister(&svm->notifier, svm->mm);
+ intel_svm_free_pasid(mm);
+ if (svm->notifier.ops) {
+ mmu_notifier_unregister(&svm->notifier, mm);
/* Clear mm's pasid. */
- load_pasid(svm->mm, PASID_DISABLED);
+ load_pasid(mm, PASID_DISABLED);
}
- list_del(&svm->list);
pasid_private_remove(svm->pasid);
/* We mandate that no page faults may be outstanding
* for the PASID when intel_svm_unbind_mm() is called.
@@ -1073,31 +1017,42 @@ static irqreturn_t prq_event_thread(int irq, void *d)
return IRQ_RETVAL(handled);
}
-#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
-struct iommu_sva *
-intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
+struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
{
- struct iommu_sva *sva = ERR_PTR(-EINVAL);
- struct intel_svm_dev *sdev = NULL;
+ struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
unsigned int flags = 0;
+ struct iommu_sva *sva;
int ret;
- /*
- * TODO: Consolidate with generic iommu-sva bind after it is merged.
- * It will require shared SVM data structures, i.e. combine io_mm
- * and intel_svm etc.
- */
if (drvdata)
flags = *(unsigned int *)drvdata;
+
+ if (flags & SVM_FLAG_SUPERVISOR_MODE) {
+ if (!ecap_srs(iommu->ecap)) {
+ dev_err(dev, "%s: Supervisor PASID not supported\n",
+ iommu->name);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ if (mm) {
+ dev_err(dev, "%s: Supervisor PASID with user provided mm\n",
+ iommu->name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ mm = &init_mm;
+ }
+
mutex_lock(&pasid_mutex);
- ret = intel_svm_bind_mm(dev, flags, mm, &sdev);
- if (ret)
- sva = ERR_PTR(ret);
- else if (sdev)
- sva = &sdev->sva;
- else
- WARN(!sdev, "SVM bind succeeded with no sdev!\n");
+ ret = intel_svm_alloc_pasid(dev, mm, flags);
+ if (ret) {
+ mutex_unlock(&pasid_mutex);
+ return ERR_PTR(ret);
+ }
+ sva = intel_svm_bind_mm(iommu, dev, mm, flags);
+ if (IS_ERR_OR_NULL(sva))
+ intel_svm_free_pasid(mm);
mutex_unlock(&pasid_mutex);
return sva;
@@ -1105,10 +1060,9 @@ intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
void intel_svm_unbind(struct iommu_sva *sva)
{
- struct intel_svm_dev *sdev;
+ struct intel_svm_dev *sdev = to_intel_svm_dev(sva);
mutex_lock(&pasid_mutex);
- sdev = to_intel_svm_dev(sva);
intel_svm_unbind_mm(sdev->dev, sdev->pasid);
mutex_unlock(&pasid_mutex);
}
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 7e5b240b801d..a37bd54c5b90 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -42,6 +42,7 @@ config INTEL_IOMMU_SVM
select PCI_PRI
select MMU_NOTIFIER
select IOASID
+ select IOMMU_SVA_LIB
help
Shared Virtual Memory (SVM) provides a facility for devices
to access DMA resources through process address space by
--
2.25.1
It's common to iterate over the SVM device list to find a matching device.
Add common helpers to do this and consolidate the code.
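A usage sketch (the wrapper below is hypothetical; the lookup helper and
the caching pattern come from this patch and the PRQ path):

/* Map a page request's source-id back to its sdev, reusing the cached one. */
static struct intel_svm_dev *
example_req_to_sdev(struct intel_svm *svm, struct page_req_dsc *req,
		    struct intel_svm_dev *cached)
{
	/* Fast path: consecutive requests usually come from the same device. */
	if (cached && cached->sid == req->rid)
		return cached;

	/* Otherwise fall back to the RCU-protected list walk in the helper. */
	return svm_lookup_device_by_sid(svm, req->rid);
}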
Signed-off-by: Lu Baolu <[email protected]>
---
drivers/iommu/intel/svm.c | 68 +++++++++++++++++++++++----------------
1 file changed, 40 insertions(+), 28 deletions(-)
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index da4310686ed3..57867ff97bc2 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -48,6 +48,40 @@ static void *pasid_private_find(ioasid_t pasid)
return xa_load(&pasid_private_array, pasid);
}
+static struct intel_svm_dev *
+svm_lookup_device_by_sid(struct intel_svm *svm, u16 sid)
+{
+ struct intel_svm_dev *sdev = NULL, *t;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(t, &svm->devs, list) {
+ if (t->sid == sid) {
+ sdev = t;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return sdev;
+}
+
+static struct intel_svm_dev *
+svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev)
+{
+ struct intel_svm_dev *sdev = NULL, *t;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(t, &svm->devs, list) {
+ if (t->dev == dev) {
+ sdev = t;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return sdev;
+}
+
int intel_svm_enable_prq(struct intel_iommu *iommu)
{
struct page *pages;
@@ -225,15 +259,11 @@ static const struct mmu_notifier_ops intel_mmuops = {
static DEFINE_MUTEX(pasid_mutex);
-#define for_each_svm_dev(sdev, svm, d) \
- list_for_each_entry((sdev), &(svm)->devs, list) \
- if ((d) != (sdev)->dev) {} else
-
static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
struct intel_svm **rsvm,
struct intel_svm_dev **rsdev)
{
- struct intel_svm_dev *d, *sdev = NULL;
+ struct intel_svm_dev *sdev = NULL;
struct intel_svm *svm;
/* The caller should hold the pasid_mutex lock */
@@ -256,15 +286,7 @@ static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
*/
if (WARN_ON(list_empty(&svm->devs)))
return -EINVAL;
-
- rcu_read_lock();
- list_for_each_entry_rcu(d, &svm->devs, list) {
- if (d->dev == dev) {
- sdev = d;
- break;
- }
- }
- rcu_read_unlock();
+ sdev = svm_lookup_device_by_dev(svm, dev);
out:
*rsvm = svm;
@@ -533,7 +555,8 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
}
/* Find the matching device in svm list */
- for_each_svm_dev(sdev, svm, dev) {
+ sdev = svm_lookup_device_by_dev(svm, dev);
+ if (sdev) {
sdev->users++;
goto success;
}
@@ -901,19 +924,8 @@ static irqreturn_t prq_event_thread(int irq, void *d)
}
}
- if (!sdev || sdev->sid != req->rid) {
- struct intel_svm_dev *t;
-
- sdev = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(t, &svm->devs, list) {
- if (t->sid == req->rid) {
- sdev = t;
- break;
- }
- }
- rcu_read_unlock();
- }
+ if (!sdev || sdev->sid != req->rid)
+ sdev = svm_lookup_device_by_sid(svm, req->rid);
/* Since we're using init_mm.pgd directly, we should never take
* any faults on kernel addresses. */
--
2.25.1
Refactor prq_event_thread() by moving the handling of a single page
request out of the main loop.
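The resulting loop shape, roughly (a condensed sketch, not the literal
code; the wrapper name is made up and the bad-request/error paths are
trimmed):

static void example_prq_loop(struct intel_iommu *iommu, int head, int tail)
{
	struct intel_svm_dev *sdev = NULL;
	struct intel_svm *svm = NULL;
	struct page_req_dsc *req;

	while (head != tail) {
		req = &iommu->prq[head / sizeof(*req)];

		/* svm/sdev are cached across iterations, as before. */
		if (!svm || svm->pasid != req->pasid)
			svm = pasid_private_find(req->pasid);
		if (!sdev || sdev->sid != req->rid)
			sdev = svm_lookup_device_by_sid(svm, req->rid);

		/* All per-request fault handling now lives in one helper. */
		handle_single_prq_event(iommu, svm->mm, req);

		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}
}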
Signed-off-by: Lu Baolu <[email protected]>
---
drivers/iommu/intel/svm.c | 239 ++++++++++++++++++++++----------------
1 file changed, 136 insertions(+), 103 deletions(-)
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 57867ff97bc2..d51ddece4259 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -866,141 +866,174 @@ intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
return iommu_report_device_fault(dev, &event);
}
+static void handle_bad_prq_event(struct intel_iommu *iommu,
+ struct page_req_dsc *req, int result)
+{
+ struct qi_desc desc;
+
+ pr_err("%s: Invalid page request: %08llx %08llx\n",
+ iommu->name, ((unsigned long long *)req)[0],
+ ((unsigned long long *)req)[1]);
+
+ /*
+ * Per VT-d spec. v3.0 ch7.7, system software must
+ * respond with page group response if private data
+ * is present (PDP) or last page in group (LPIG) bit
+ * is set. This is an additional VT-d feature beyond
+ * PCI ATS spec.
+ */
+ if (!req->lpig && !req->priv_data_present)
+ return;
+
+ desc.qw0 = QI_PGRP_PASID(req->pasid) |
+ QI_PGRP_DID(req->rid) |
+ QI_PGRP_PASID_P(req->pasid_present) |
+ QI_PGRP_PDP(req->priv_data_present) |
+ QI_PGRP_RESP_CODE(result) |
+ QI_PGRP_RESP_TYPE;
+ desc.qw1 = QI_PGRP_IDX(req->prg_index) |
+ QI_PGRP_LPIG(req->lpig);
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ if (req->priv_data_present)
+ memcpy(&desc.qw2, req->priv_data, sizeof(req->priv_data));
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+static void handle_single_prq_event(struct intel_iommu *iommu,
+ struct mm_struct *mm,
+ struct page_req_dsc *req)
+{
+ u64 address = (u64)req->addr << VTD_PAGE_SHIFT;
+ int result = QI_RESP_INVALID;
+ struct vm_area_struct *vma;
+ struct qi_desc desc;
+ unsigned int flags;
+ vm_fault_t ret;
+
+ /* If the mm is already defunct, don't handle faults. */
+ if (!mmget_not_zero(mm))
+ goto response;
+
+ mmap_read_lock(mm);
+ vma = find_extend_vma(mm, address);
+ if (!vma || address < vma->vm_start)
+ goto invalid;
+
+ if (access_error(vma, req))
+ goto invalid;
+
+ flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE;
+ if (req->wr_req)
+ flags |= FAULT_FLAG_WRITE;
+
+ ret = handle_mm_fault(vma, address, flags, NULL);
+ if (!(ret & VM_FAULT_ERROR))
+ result = QI_RESP_SUCCESS;
+invalid:
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+response:
+ if (!(req->lpig || req->priv_data_present))
+ return;
+
+ desc.qw0 = QI_PGRP_PASID(req->pasid) |
+ QI_PGRP_DID(req->rid) |
+ QI_PGRP_PASID_P(req->pasid_present) |
+ QI_PGRP_PDP(req->priv_data_present) |
+ QI_PGRP_RESP_CODE(result) |
+ QI_PGRP_RESP_TYPE;
+ desc.qw1 = QI_PGRP_IDX(req->prg_index) |
+ QI_PGRP_LPIG(req->lpig);
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ if (req->priv_data_present)
+ memcpy(&desc.qw2, req->priv_data, sizeof(req->priv_data));
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
static irqreturn_t prq_event_thread(int irq, void *d)
{
struct intel_svm_dev *sdev = NULL;
struct intel_iommu *iommu = d;
struct intel_svm *svm = NULL;
- int head, tail, handled = 0;
- unsigned int flags = 0;
+ struct page_req_dsc *req;
+ int head, tail, handled;
+ u64 address;
- /* Clear PPR bit before reading head/tail registers, to
- * ensure that we get a new interrupt if needed. */
+ /*
+ * Clear PPR bit before reading head/tail registers, to ensure that
+ * we get a new interrupt if needed.
+ */
writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+ handled = (head != tail);
while (head != tail) {
- struct vm_area_struct *vma;
- struct page_req_dsc *req;
- struct qi_desc resp;
- int result;
- vm_fault_t ret;
- u64 address;
-
- handled = 1;
req = &iommu->prq[head / sizeof(*req)];
- result = QI_RESP_INVALID;
address = (u64)req->addr << VTD_PAGE_SHIFT;
- if (!req->pasid_present) {
- pr_err("%s: Page request without PASID: %08llx %08llx\n",
- iommu->name, ((unsigned long long *)req)[0],
- ((unsigned long long *)req)[1]);
- goto no_pasid;
+
+ if (unlikely(!req->pasid_present)) {
+ pr_err("IOMMU: %s: Page request without PASID\n",
+ iommu->name);
+bad_req:
+ svm = NULL;
+ sdev = NULL;
+ handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
+ goto prq_advance;
}
- /* We shall not receive page request for supervisor SVM */
- if (req->pm_req && (req->rd_req | req->wr_req)) {
- pr_err("Unexpected page request in Privilege Mode");
- /* No need to find the matching sdev as for bad_req */
- goto no_pasid;
+
+ if (unlikely(!is_canonical_address(address))) {
+ pr_err("IOMMU: %s: Address is not canonical\n",
+ iommu->name);
+ goto bad_req;
+ }
+
+ if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) {
+ pr_err("IOMMU: %s: Page request in Privilege Mode\n",
+ iommu->name);
+ goto bad_req;
}
- /* DMA read with exec requeset is not supported. */
- if (req->exe_req && req->rd_req) {
- pr_err("Execution request not supported\n");
- goto no_pasid;
+
+ if (unlikely(req->exe_req && req->rd_req)) {
+ pr_err("IOMMU: %s: Execution request not supported\n",
+ iommu->name);
+ goto bad_req;
}
+
if (!svm || svm->pasid != req->pasid) {
- rcu_read_lock();
- svm = pasid_private_find(req->pasid);
- /* It *can't* go away, because the driver is not permitted
+ /*
+ * It can't go away, because the driver is not permitted
* to unbind the mm while any page faults are outstanding.
- * So we only need RCU to protect the internal idr code. */
- rcu_read_unlock();
- if (IS_ERR_OR_NULL(svm)) {
- pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
- iommu->name, req->pasid, ((unsigned long long *)req)[0],
- ((unsigned long long *)req)[1]);
- goto no_pasid;
- }
+ */
+ svm = pasid_private_find(req->pasid);
+ if (IS_ERR_OR_NULL(svm) || (svm->flags & SVM_FLAG_SUPERVISOR_MODE))
+ goto bad_req;
}
- if (!sdev || sdev->sid != req->rid)
+ if (!sdev || sdev->sid != req->rid) {
sdev = svm_lookup_device_by_sid(svm, req->rid);
-
- /* Since we're using init_mm.pgd directly, we should never take
- * any faults on kernel addresses. */
- if (!svm->mm)
- goto bad_req;
-
- /* If address is not canonical, return invalid response */
- if (!is_canonical_address(address))
- goto bad_req;
+ if (!sdev)
+ goto bad_req;
+ }
/*
* If prq is to be handled outside iommu driver via receiver of
* the fault notifiers, we skip the page response here.
*/
if (svm->flags & SVM_FLAG_GUEST_MODE) {
- if (sdev && !intel_svm_prq_report(sdev->dev, req))
+ if (!intel_svm_prq_report(sdev->dev, req))
goto prq_advance;
else
goto bad_req;
}
- /* If the mm is already defunct, don't handle faults. */
- if (!mmget_not_zero(svm->mm))
- goto bad_req;
-
- mmap_read_lock(svm->mm);
- vma = find_extend_vma(svm->mm, address);
- if (!vma || address < vma->vm_start)
- goto invalid;
-
- if (access_error(vma, req))
- goto invalid;
-
- flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE;
- if (req->wr_req)
- flags |= FAULT_FLAG_WRITE;
-
- ret = handle_mm_fault(vma, address, flags, NULL);
- if (ret & VM_FAULT_ERROR)
- goto invalid;
-
- result = QI_RESP_SUCCESS;
-invalid:
- mmap_read_unlock(svm->mm);
- mmput(svm->mm);
-bad_req:
- /* We get here in the error case where the PASID lookup failed,
- and these can be NULL. Do not use them below this point! */
- sdev = NULL;
- svm = NULL;
-no_pasid:
- if (req->lpig || req->priv_data_present) {
- /*
- * Per VT-d spec. v3.0 ch7.7, system software must
- * respond with page group response if private data
- * is present (PDP) or last page in group (LPIG) bit
- * is set. This is an additional VT-d feature beyond
- * PCI ATS spec.
- */
- resp.qw0 = QI_PGRP_PASID(req->pasid) |
- QI_PGRP_DID(req->rid) |
- QI_PGRP_PASID_P(req->pasid_present) |
- QI_PGRP_PDP(req->priv_data_present) |
- QI_PGRP_RESP_CODE(result) |
- QI_PGRP_RESP_TYPE;
- resp.qw1 = QI_PGRP_IDX(req->prg_index) |
- QI_PGRP_LPIG(req->lpig);
- resp.qw2 = 0;
- resp.qw3 = 0;
-
- if (req->priv_data_present)
- memcpy(&resp.qw2, req->priv_data,
- sizeof(req->priv_data));
- qi_submit_sync(iommu, &resp, 1, 0);
- }
+ handle_single_prq_event(iommu, svm->mm, req);
prq_advance:
head = (head + sizeof(*req)) & PRQ_RING_MASK;
}
--
2.25.1
This allocates and registers the iopf queue infrastructure for devices
that want to support I/O page faults for SVA.
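Condensed from the patch, the per-device side looks roughly like this
(the function name is hypothetical; the checks mirror
intel_iommu_enable_sva(), with the NULL check done before dereferencing
info):

static int example_enable_sva(struct device *dev)
{
	struct device_domain_info *info = get_domain_info(dev);

	if (!info || !info->iommu || dmar_disabled)
		return -EINVAL;
	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
		return -EINVAL;
	if (!(info->iommu->flags & VTD_FLAG_SVM_CAPABLE))
		return -ENODEV;
	if (intel_iommu_enable_pasid(info->iommu, dev))
		return -ENODEV;

	/* The queue itself is allocated per IOMMU in intel_svm_enable_prq(). */
	return iopf_queue_add_device(info->iommu->iopf_queue, dev);
}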
Signed-off-by: Lu Baolu <[email protected]>
---
include/linux/intel-iommu.h | 2 ++
drivers/iommu/intel/iommu.c | 66 ++++++++++++++++++++++++++-----------
drivers/iommu/intel/svm.c | 37 +++++++++++++++++----
3 files changed, 79 insertions(+), 26 deletions(-)
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 4e8bb186daa7..222520d149c1 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -606,6 +606,8 @@ struct intel_iommu {
struct completion prq_complete;
struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */
#endif
+ struct iopf_queue *iopf_queue;
+ unsigned char iopfq_name[16];
struct q_inval *qi; /* Queued invalidation info */
u32 *iommu_state; /* Store iommu states between suspend and resume.*/
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index fa6223bf1e7b..e72d8b10b2a1 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -46,6 +46,7 @@
#include <asm/iommu.h>
#include "../irq_remapping.h"
+#include "../iommu-sva-lib.h"
#include "pasid.h"
#include "cap_audit.h"
@@ -5338,6 +5339,34 @@ static int intel_iommu_disable_auxd(struct device *dev)
return 0;
}
+static int intel_iommu_enable_sva(struct device *dev)
+{
+ struct device_domain_info *info = get_domain_info(dev);
+ struct intel_iommu *iommu = info->iommu;
+
+ if (!info || !iommu || dmar_disabled)
+ return -EINVAL;
+
+ if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
+ return -EINVAL;
+
+ if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
+ return -ENODEV;
+
+ if (intel_iommu_enable_pasid(iommu, dev))
+ return -ENODEV;
+
+ return iopf_queue_add_device(iommu->iopf_queue, dev);
+}
+
+static int intel_iommu_disable_sva(struct device *dev)
+{
+ struct device_domain_info *info = get_domain_info(dev);
+ struct intel_iommu *iommu = info->iommu;
+
+ return iopf_queue_remove_device(iommu->iopf_queue, dev);
+}
+
/*
* A PCI express designated vendor specific extended capability is defined
* in the section 3.7 of Intel scalable I/O virtualization technical spec
@@ -5399,38 +5428,37 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
static int
intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
- if (feat == IOMMU_DEV_FEAT_AUX)
+ switch (feat) {
+ case IOMMU_DEV_FEAT_AUX:
return intel_iommu_enable_auxd(dev);
- if (feat == IOMMU_DEV_FEAT_IOPF)
+ case IOMMU_DEV_FEAT_IOPF:
return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;
- if (feat == IOMMU_DEV_FEAT_SVA) {
- struct device_domain_info *info = get_domain_info(dev);
-
- if (!info)
- return -EINVAL;
-
- if (intel_iommu_enable_pasid(info->iommu, dev))
- return -ENODEV;
+ case IOMMU_DEV_FEAT_SVA:
+ return intel_iommu_enable_sva(dev);
- if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
- return -EINVAL;
-
- if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
- return 0;
+ default:
+ return -ENODEV;
}
-
- return -ENODEV;
}
static int
intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
- if (feat == IOMMU_DEV_FEAT_AUX)
+ switch (feat) {
+ case IOMMU_DEV_FEAT_AUX:
return intel_iommu_disable_auxd(dev);
- return -ENODEV;
+ case IOMMU_DEV_FEAT_IOPF:
+ return 0;
+
+ case IOMMU_DEV_FEAT_SVA:
+ return intel_iommu_disable_sva(dev);
+
+ default:
+ return -ENODEV;
+ }
}
static bool
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index d51ddece4259..4dc3ab36e9ae 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -84,6 +84,7 @@ svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev)
int intel_svm_enable_prq(struct intel_iommu *iommu)
{
+ struct iopf_queue *iopfq;
struct page *pages;
int irq, ret;
@@ -100,13 +101,20 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
iommu->name);
ret = -EINVAL;
- err:
- free_pages((unsigned long)iommu->prq, PRQ_ORDER);
- iommu->prq = NULL;
- return ret;
+ goto free_prq;
}
iommu->pr_irq = irq;
+ snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name),
+ "dmar%d-iopfq", iommu->seq_id);
+ iopfq = iopf_queue_alloc(iommu->iopfq_name);
+ if (!iopfq) {
+ pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name);
+ ret = -ENOMEM;
+ goto free_hwirq;
+ }
+ iommu->iopf_queue = iopfq;
+
snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
@@ -114,9 +122,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
if (ret) {
pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
iommu->name);
- dmar_free_hwirq(irq);
- iommu->pr_irq = 0;
- goto err;
+ goto free_iopfq;
}
dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
@@ -125,6 +131,18 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
init_completion(&iommu->prq_complete);
return 0;
+
+free_iopfq:
+ iopf_queue_free(iommu->iopf_queue);
+ iommu->iopf_queue = NULL;
+free_hwirq:
+ dmar_free_hwirq(irq);
+ iommu->pr_irq = 0;
+free_prq:
+ free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+ iommu->prq = NULL;
+
+ return ret;
}
int intel_svm_finish_prq(struct intel_iommu *iommu)
@@ -139,6 +157,11 @@ int intel_svm_finish_prq(struct intel_iommu *iommu)
iommu->pr_irq = 0;
}
+ if (iommu->iopf_queue) {
+ iopf_queue_free(iommu->iopf_queue);
+ iommu->iopf_queue = NULL;
+ }
+
free_pages((unsigned long)iommu->prq, PRQ_ORDER);
iommu->prq = NULL;
--
2.25.1
The execution time of some operations, such as cache invalidation and PRQ
processing, is performance critical. This adds common code to monitor the
execution-time distribution of those operations. The interfaces include
enabling/disabling the monitor, checking its status, updating sampling
data, and providing a common string format for users.
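A hedged usage sketch of the interface (the caller and the measured
operation are stand-ins; only the dmar_latency_*() calls come from this
patch):

static void example_sample(struct intel_iommu *iommu)
{
	s64 start;

	if (dmar_latency_enable(iommu, DMAR_LATENCY_INV_IOTLB))
		return;

	start = ktime_to_ns(ktime_get());
	/* ... issue and wait for an IOTLB invalidation here ... */
	dmar_latency_update(iommu, DMAR_LATENCY_INV_IOTLB,
			    ktime_to_ns(ktime_get()) - start);
}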
Signed-off-by: Fenghua Yu <[email protected]>
Signed-off-by: Lu Baolu <[email protected]>
---
include/linux/intel-iommu.h | 1 +
drivers/iommu/intel/perf.h | 73 +++++++++++++++
drivers/iommu/intel/perf.c | 166 +++++++++++++++++++++++++++++++++++
drivers/iommu/intel/Kconfig | 3 +
drivers/iommu/intel/Makefile | 1 +
5 files changed, 244 insertions(+)
create mode 100644 drivers/iommu/intel/perf.h
create mode 100644 drivers/iommu/intel/perf.c
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 98b04fa9373e..f5cf31dd7280 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -621,6 +621,7 @@ struct intel_iommu {
u32 flags; /* Software defined flags */
struct dmar_drhd_unit *drhd;
+ void *perf_statistic;
};
/* Per subdevice private data */
diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h
new file mode 100644
index 000000000000..fd6db8049d1a
--- /dev/null
+++ b/drivers/iommu/intel/perf.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * perf.h - performance monitor header
+ *
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * Author: Lu Baolu <[email protected]>
+ */
+
+enum latency_type {
+ DMAR_LATENCY_INV_IOTLB = 0,
+ DMAR_LATENCY_INV_DEVTLB,
+ DMAR_LATENCY_INV_IEC,
+ DMAR_LATENCY_PRQ,
+ DMAR_LATENCY_NUM
+};
+
+enum latency_count {
+ COUNTS_10e2 = 0, /* < 0.1us */
+ COUNTS_10e3, /* 0.1us ~ 1us */
+ COUNTS_10e4, /* 1us ~ 10us */
+ COUNTS_10e5, /* 10us ~ 100us */
+ COUNTS_10e6, /* 100us ~ 1ms */
+ COUNTS_10e7, /* 1ms ~ 10ms */
+ COUNTS_10e8_plus, /* 10ms and plus*/
+ COUNTS_MIN,
+ COUNTS_MAX,
+ COUNTS_SUM,
+ COUNTS_NUM
+};
+
+struct latency_statistic {
+ bool enabled;
+ u64 counter[COUNTS_NUM];
+ u64 samples;
+};
+
+#ifdef CONFIG_DMAR_PERF
+int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type);
+void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type);
+bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type);
+void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type,
+ u64 latency);
+int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size);
+#else
+static inline int
+dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type)
+{
+ return -EINVAL;
+}
+
+static inline void
+dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type)
+{
+}
+
+static inline bool
+dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type)
+{
+ return false;
+}
+
+static inline void
+dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency)
+{
+}
+
+static inline int
+dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size)
+{
+ return 0;
+}
+#endif /* CONFIG_DMAR_PERF */
diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c
new file mode 100644
index 000000000000..faaa96dda437
--- /dev/null
+++ b/drivers/iommu/intel/perf.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * perf.c - performance monitor
+ *
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * Author: Lu Baolu <[email protected]>
+ * Fenghua Yu <[email protected]>
+ */
+
+#include <linux/spinlock.h>
+#include <linux/intel-iommu.h>
+
+#include "perf.h"
+
+static DEFINE_SPINLOCK(latency_lock);
+
+bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type)
+{
+ struct latency_statistic *lstat = iommu->perf_statistic;
+
+ return lstat && lstat[type].enabled;
+}
+
+int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type)
+{
+ struct latency_statistic *lstat;
+ unsigned long flags;
+ int ret = -EBUSY;
+
+ if (dmar_latency_enabled(iommu, type))
+ return 0;
+
+ spin_lock_irqsave(&latency_lock, flags);
+ if (!iommu->perf_statistic) {
+ iommu->perf_statistic = kzalloc(sizeof(*lstat) * DMAR_LATENCY_NUM,
+ GFP_ATOMIC);
+ if (!iommu->perf_statistic) {
+ ret = -ENOMEM;
+ goto unlock_out;
+ }
+ }
+
+ lstat = iommu->perf_statistic;
+
+ if (!lstat[type].enabled) {
+ lstat[type].enabled = true;
+ lstat[type].counter[COUNTS_MIN] = UINT_MAX;
+ ret = 0;
+ }
+unlock_out:
+ spin_unlock_irqrestore(&latency_lock, flags);
+
+ return ret;
+}
+
+void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type)
+{
+ struct latency_statistic *lstat = iommu->perf_statistic;
+ unsigned long flags;
+
+ if (!dmar_latency_enabled(iommu, type))
+ return;
+
+ spin_lock_irqsave(&latency_lock, flags);
+ memset(&lstat[type], 0, sizeof(*lstat) * DMAR_LATENCY_NUM);
+ spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency)
+{
+ struct latency_statistic *lstat = iommu->perf_statistic;
+ unsigned long flags;
+ u64 min, max;
+
+ if (!dmar_latency_enabled(iommu, type))
+ return;
+
+ spin_lock_irqsave(&latency_lock, flags);
+ if (latency < 100)
+ lstat[type].counter[COUNTS_10e2]++;
+ else if (latency < 1000)
+ lstat[type].counter[COUNTS_10e3]++;
+ else if (latency < 10000)
+ lstat[type].counter[COUNTS_10e4]++;
+ else if (latency < 100000)
+ lstat[type].counter[COUNTS_10e5]++;
+ else if (latency < 1000000)
+ lstat[type].counter[COUNTS_10e6]++;
+ else if (latency < 10000000)
+ lstat[type].counter[COUNTS_10e7]++;
+ else
+ lstat[type].counter[COUNTS_10e8_plus]++;
+
+ min = lstat[type].counter[COUNTS_MIN];
+ max = lstat[type].counter[COUNTS_MAX];
+ lstat[type].counter[COUNTS_MIN] = min_t(u64, min, latency);
+ lstat[type].counter[COUNTS_MAX] = max_t(u64, max, latency);
+ lstat[type].counter[COUNTS_SUM] += latency;
+ lstat[type].samples++;
+ spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static char *latency_counter_names[] = {
+ " <0.1us",
+ " 0.1us-1us", " 1us-10us", " 10us-100us",
+ " 100us-1ms", " 1ms-10ms", " >=10ms",
+ " min(us)", " max(us)", " average(us)"
+};
+
+static char *latency_type_names[] = {
+ " inv_iotlb", " inv_devtlb", " inv_iec",
+ " svm_prq"
+};
+
+int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size)
+{
+ struct latency_statistic *lstat = iommu->perf_statistic;
+ unsigned long flags;
+ int bytes = 0, i, j;
+
+ memset(str, 0, size);
+
+ for (i = 0; i < COUNTS_NUM; i++)
+ bytes += snprintf(str + bytes, size - bytes,
+ "%s", latency_counter_names[i]);
+
+ spin_lock_irqsave(&latency_lock, flags);
+ for (i = 0; i < DMAR_LATENCY_NUM; i++) {
+ if (!dmar_latency_enabled(iommu, i))
+ continue;
+
+ bytes += snprintf(str + bytes, size - bytes,
+ "\n%s", latency_type_names[i]);
+
+ for (j = 0; j < COUNTS_NUM; j++) {
+ u64 val = lstat[i].counter[j];
+
+ switch (j) {
+ case COUNTS_MIN:
+ if (val == UINT_MAX)
+ val = 0;
+ else
+ val /= 1000;
+ break;
+ case COUNTS_MAX:
+ val /= 1000;
+ break;
+ case COUNTS_SUM:
+ if (lstat[i].samples)
+ val /= (lstat[i].samples * 1000);
+ else
+ val = 0;
+ break;
+ default:
+ break;
+ }
+
+ bytes += snprintf(str + bytes, size - bytes,
+ "%12lld", val);
+ }
+ }
+ spin_unlock_irqrestore(&latency_lock, flags);
+
+ return bytes;
+}
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index a37bd54c5b90..59be5447b775 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -3,6 +3,9 @@
config DMAR_TABLE
bool
+config DMAR_PERF
+ bool
+
config INTEL_IOMMU
bool "Support for Intel IOMMU using DMA Remapping Devices"
depends on PCI_MSI && ACPI && (X86 || IA64)
diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
index ae236ec7d219..fa0dae16441c 100644
--- a/drivers/iommu/intel/Makefile
+++ b/drivers/iommu/intel/Makefile
@@ -2,6 +2,7 @@
obj-$(CONFIG_DMAR_TABLE) += dmar.o
obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o
obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o
+obj-$(CONFIG_DMAR_PERF) += perf.o
obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o
obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
--
2.25.1
A debugfs interface, /sys/kernel/debug/iommu/intel/dmar_perf_latency, is
created to control the latency monitors and show counts of execution-time
ranges for various operation types per DMAR unit. The interface may help
debug potential performance issues.
By default, the interface is disabled.
Possible write values of /sys/kernel/debug/iommu/intel/dmar_perf_latency:
0 - disable sampling of all latency data
1 - enable sampling of IOTLB invalidation latency data
2 - enable sampling of devTLB invalidation latency data
3 - enable sampling of interrupt entry cache invalidation latency data
4 - enable sampling of PRQ handling latency data
Reading /sys/kernel/debug/iommu/intel/dmar_perf_latency gives a snapshot
of the sampling results for all enabled monitors.
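For completeness, a minimal user-space sketch of driving the interface
(hypothetical test program, not part of the patch):

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/iommu/intel/dmar_perf_latency";
	char line[256];
	FILE *f;

	/* Enable IOTLB invalidation latency sampling (write value 1). */
	f = fopen(path, "w");
	if (!f)
		return 1;
	fputs("1\n", f);
	fclose(f);

	/* Dump a snapshot of all enabled monitors. */
	f = fopen(path, "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}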
Signed-off-by: Fenghua Yu <[email protected]>
Signed-off-by: Lu Baolu <[email protected]>
---
drivers/iommu/intel/debugfs.c | 111 ++++++++++++++++++++++++++++++++++
drivers/iommu/intel/Kconfig | 1 +
2 files changed, 112 insertions(+)
diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c
index efea7f02abd9..62e23ff3c987 100644
--- a/drivers/iommu/intel/debugfs.c
+++ b/drivers/iommu/intel/debugfs.c
@@ -16,6 +16,7 @@
#include <asm/irq_remapping.h>
#include "pasid.h"
+#include "perf.h"
struct tbl_walk {
u16 bus;
@@ -31,6 +32,9 @@ struct iommu_regset {
const char *regs;
};
+#define DEBUG_BUFFER_SIZE 1024
+static char debug_buf[DEBUG_BUFFER_SIZE];
+
#define IOMMU_REGSET_ENTRY(_reg_) \
{ DMAR_##_reg_##_REG, __stringify(_reg_) }
@@ -538,6 +542,111 @@ static int ir_translation_struct_show(struct seq_file *m, void *unused)
DEFINE_SHOW_ATTRIBUTE(ir_translation_struct);
#endif
+static void latency_show_one(struct seq_file *m, struct intel_iommu *iommu,
+ struct dmar_drhd_unit *drhd)
+{
+ int ret;
+
+ seq_printf(m, "IOMMU: %s Register Base Address: %llx\n",
+ iommu->name, drhd->reg_base_addr);
+
+ ret = dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE);
+ if (ret < 0)
+ seq_puts(m, "Failed to get latency snapshot");
+ else
+ seq_puts(m, debug_buf);
+ seq_puts(m, "\n");
+}
+
+static int latency_show(struct seq_file *m, void *v)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd)
+ latency_show_one(m, iommu, drhd);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int dmar_perf_latency_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, latency_show, NULL);
+}
+
+static ssize_t dmar_perf_latency_write(struct file *filp,
+ const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct dmar_drhd_unit *drhd;
+ struct intel_iommu *iommu;
+ int counting;
+ char buf[64];
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ if (kstrtoint(buf, 0, &counting))
+ return -EINVAL;
+
+ switch (counting) {
+ case 0:
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd) {
+ dmar_latency_disable(iommu, DMAR_LATENCY_INV_IOTLB);
+ dmar_latency_disable(iommu, DMAR_LATENCY_INV_DEVTLB);
+ dmar_latency_disable(iommu, DMAR_LATENCY_INV_IEC);
+ dmar_latency_disable(iommu, DMAR_LATENCY_PRQ);
+ }
+ rcu_read_unlock();
+ break;
+ case 1:
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd)
+ dmar_latency_enable(iommu, DMAR_LATENCY_INV_IOTLB);
+ rcu_read_unlock();
+ break;
+ case 2:
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd)
+ dmar_latency_enable(iommu, DMAR_LATENCY_INV_DEVTLB);
+ rcu_read_unlock();
+ break;
+ case 3:
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd)
+ dmar_latency_enable(iommu, DMAR_LATENCY_INV_IEC);
+ rcu_read_unlock();
+ break;
+ case 4:
+ rcu_read_lock();
+ for_each_active_iommu(iommu, drhd)
+ dmar_latency_enable(iommu, DMAR_LATENCY_PRQ);
+ rcu_read_unlock();
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ *ppos += cnt;
+ return cnt;
+}
+
+static const struct file_operations dmar_perf_latency_fops = {
+ .open = dmar_perf_latency_open,
+ .write = dmar_perf_latency_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
void __init intel_iommu_debugfs_init(void)
{
struct dentry *intel_iommu_debug = debugfs_create_dir("intel",
@@ -556,4 +665,6 @@ void __init intel_iommu_debugfs_init(void)
debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug,
NULL, &ir_translation_struct_fops);
#endif
+ debugfs_create_file("dmar_perf_latency", 0644, intel_iommu_debug,
+ NULL, &dmar_perf_latency_fops);
}
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 59be5447b775..43ebd8af11c5 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -28,6 +28,7 @@ config INTEL_IOMMU
config INTEL_IOMMU_DEBUGFS
bool "Export Intel IOMMU internals in Debugfs"
depends on INTEL_IOMMU && IOMMU_DEBUGFS
+ select DMAR_PERF
help
!!!WARNING!!!
--
2.25.1
Let I/O page fault requests be handled through the io-pgfault
framework.
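Condensed from the patch, the device-side setup that routes reported
faults into the framework looks roughly like this (the wrapper name is
hypothetical; failure rollback is omitted, as in the patch):

static int example_enable_iopf(struct intel_iommu *iommu, struct device *dev)
{
	int ret;

	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
	if (ret)
		return ret;

	/* Faults reported via iommu_report_device_fault() are now queued
	 * to the io-pgfault work and completed with a page response. */
	return iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
}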
Signed-off-by: Lu Baolu <[email protected]>
---
drivers/iommu/intel/iommu.c | 14 ++++++-
drivers/iommu/intel/svm.c | 84 +++----------------------------------
2 files changed, 17 insertions(+), 81 deletions(-)
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index e72d8b10b2a1..f1c2287a3646 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -5343,6 +5343,7 @@ static int intel_iommu_enable_sva(struct device *dev)
{
struct device_domain_info *info = get_domain_info(dev);
struct intel_iommu *iommu = info->iommu;
+ int ret;
if (!info || !iommu || dmar_disabled)
return -EINVAL;
@@ -5356,15 +5357,24 @@ static int intel_iommu_enable_sva(struct device *dev)
if (intel_iommu_enable_pasid(iommu, dev))
return -ENODEV;
- return iopf_queue_add_device(iommu->iopf_queue, dev);
+ ret = iopf_queue_add_device(iommu->iopf_queue, dev);
+ if (!ret)
+ ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
+
+ return ret;
}
static int intel_iommu_disable_sva(struct device *dev)
{
struct device_domain_info *info = get_domain_info(dev);
struct intel_iommu *iommu = info->iommu;
+ int ret;
+
+ ret = iommu_unregister_device_fault_handler(dev);
+ if (!ret)
+ ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
- return iopf_queue_remove_device(iommu->iopf_queue, dev);
+ return ret;
}
/*
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 4dc3ab36e9ae..ade157b64ce7 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -724,22 +724,6 @@ struct page_req_dsc {
#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)
-static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
-{
- unsigned long requested = 0;
-
- if (req->exe_req)
- requested |= VM_EXEC;
-
- if (req->rd_req)
- requested |= VM_READ;
-
- if (req->wr_req)
- requested |= VM_WRITE;
-
- return (requested & ~vma->vm_flags) != 0;
-}
-
static bool is_canonical_address(u64 addr)
{
int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
@@ -809,6 +793,8 @@ static void intel_svm_drain_prq(struct device *dev, u32 pasid)
goto prq_retry;
}
+ iopf_queue_flush_dev(dev);
+
/*
* Perform steps described in VT-d spec CH7.10 to drain page
* requests and responses in hardware.
@@ -924,61 +910,6 @@ static void handle_bad_prq_event(struct intel_iommu *iommu,
qi_submit_sync(iommu, &desc, 1, 0);
}
-static void handle_single_prq_event(struct intel_iommu *iommu,
- struct mm_struct *mm,
- struct page_req_dsc *req)
-{
- u64 address = (u64)req->addr << VTD_PAGE_SHIFT;
- int result = QI_RESP_INVALID;
- struct vm_area_struct *vma;
- struct qi_desc desc;
- unsigned int flags;
- vm_fault_t ret;
-
- /* If the mm is already defunct, don't handle faults. */
- if (!mmget_not_zero(mm))
- goto response;
-
- mmap_read_lock(mm);
- vma = find_extend_vma(mm, address);
- if (!vma || address < vma->vm_start)
- goto invalid;
-
- if (access_error(vma, req))
- goto invalid;
-
- flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE;
- if (req->wr_req)
- flags |= FAULT_FLAG_WRITE;
-
- ret = handle_mm_fault(vma, address, flags, NULL);
- if (!(ret & VM_FAULT_ERROR))
- result = QI_RESP_SUCCESS;
-invalid:
- mmap_read_unlock(mm);
- mmput(mm);
-
-response:
- if (!(req->lpig || req->priv_data_present))
- return;
-
- desc.qw0 = QI_PGRP_PASID(req->pasid) |
- QI_PGRP_DID(req->rid) |
- QI_PGRP_PASID_P(req->pasid_present) |
- QI_PGRP_PDP(req->priv_data_present) |
- QI_PGRP_RESP_CODE(result) |
- QI_PGRP_RESP_TYPE;
- desc.qw1 = QI_PGRP_IDX(req->prg_index) |
- QI_PGRP_LPIG(req->lpig);
- desc.qw2 = 0;
- desc.qw3 = 0;
-
- if (req->priv_data_present)
- memcpy(&desc.qw2, req->priv_data, sizeof(req->priv_data));
-
- qi_submit_sync(iommu, &desc, 1, 0);
-}
-
static irqreturn_t prq_event_thread(int irq, void *d)
{
struct intel_svm_dev *sdev = NULL;
@@ -1049,14 +980,8 @@ static irqreturn_t prq_event_thread(int irq, void *d)
* If prq is to be handled outside iommu driver via receiver of
* the fault notifiers, we skip the page response here.
*/
- if (svm->flags & SVM_FLAG_GUEST_MODE) {
- if (!intel_svm_prq_report(sdev->dev, req))
- goto prq_advance;
- else
- goto bad_req;
- }
-
- handle_single_prq_event(iommu, svm->mm, req);
+ if (intel_svm_prq_report(sdev->dev, req))
+ handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
prq_advance:
head = (head + sizeof(*req)) & PRQ_RING_MASK;
}
@@ -1073,6 +998,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
if (head == tail) {
+ iopf_queue_discard_partial(iommu->iopf_queue);
writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared",
iommu->name);
--
2.25.1
This adds a new trace event to track page fault request reporting. The
event provides almost all of the information defined in a page request
descriptor.
A sample output:
| prq_report: dmar0/0000:00:0a.0 seq# 1: rid=0x50 addr=0x559ef6f97 r---- pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 2: rid=0x50 addr=0x559ef6f9c rw--l pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 3: rid=0x50 addr=0x559ef6f98 r---- pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 4: rid=0x50 addr=0x559ef6f9d rw--l pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 5: rid=0x50 addr=0x559ef6f99 r---- pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 6: rid=0x50 addr=0x559ef6f9e rw--l pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 7: rid=0x50 addr=0x559ef6f9a r---- pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 8: rid=0x50 addr=0x559ef6f9f rw--l pasid=0x2 index=0x1
This will be helpful for I/O page fault related debugging.
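The five-character flag column in the sample output is derived from the
raw descriptor words as below (a sketch mirroring decode_prq_descriptor()
added by this patch; the helper name is made up):

static void example_flags(u64 dw0, u64 dw1, char out[6])
{
	out[0] = dw1 & BIT_ULL(0) ? 'r' : '-';	/* read */
	out[1] = dw1 & BIT_ULL(1) ? 'w' : '-';	/* write */
	out[2] = dw0 & BIT_ULL(52) ? 'x' : '-';	/* execute */
	out[3] = dw0 & BIT_ULL(53) ? 'p' : '-';	/* privileged */
	out[4] = dw1 & BIT_ULL(2) ? 'l' : '-';	/* last page in group */
	out[5] = '\0';
}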
Signed-off-by: Lu Baolu <[email protected]>
---
include/linux/intel-iommu.h | 29 +++++++++++++++++++++++
include/trace/events/intel_iommu.h | 37 ++++++++++++++++++++++++++++++
drivers/iommu/intel/svm.c | 7 ++++++
3 files changed, 73 insertions(+)
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 222520d149c1..98b04fa9373e 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -778,6 +778,7 @@ struct intel_svm_dev {
struct device *dev;
struct intel_iommu *iommu;
struct iommu_sva sva;
+ unsigned long prq_seq_number;
u32 pasid;
int users;
u16 did;
@@ -828,4 +829,32 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
#define intel_iommu_enabled (0)
#endif
+static inline const char *decode_prq_descriptor(char *str, size_t size,
+ u64 dw0, u64 dw1, u64 dw2, u64 dw3)
+{
+ char *buf = str;
+ int bytes;
+
+ bytes = snprintf(buf, size,
+ "rid=0x%llx addr=0x%llx %c%c%c%c%c pasid=0x%llx index=0x%llx",
+ FIELD_GET(GENMASK_ULL(31, 16), dw0),
+ FIELD_GET(GENMASK_ULL(63, 12), dw1),
+ dw1 & BIT_ULL(0) ? 'r' : '-',
+ dw1 & BIT_ULL(1) ? 'w' : '-',
+ dw0 & BIT_ULL(52) ? 'x' : '-',
+ dw0 & BIT_ULL(53) ? 'p' : '-',
+ dw1 & BIT_ULL(2) ? 'l' : '-',
+ FIELD_GET(GENMASK_ULL(51, 32), dw0),
+ FIELD_GET(GENMASK_ULL(11, 3), dw1));
+
+ /* Private Data */
+ if (dw0 & BIT_ULL(9)) {
+ size -= bytes;
+ buf += bytes;
+ snprintf(buf, size, " private=0x%llx/0x%llx\n", dw2, dw3);
+ }
+
+ return str;
+}
+
#endif
diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h
index d233f2916584..e5c1ca6d16ee 100644
--- a/include/trace/events/intel_iommu.h
+++ b/include/trace/events/intel_iommu.h
@@ -15,6 +15,8 @@
#include <linux/tracepoint.h>
#include <linux/intel-iommu.h>
+#define MSG_MAX 256
+
TRACE_EVENT(qi_submit,
TP_PROTO(struct intel_iommu *iommu, u64 qw0, u64 qw1, u64 qw2, u64 qw3),
@@ -51,6 +53,41 @@ TRACE_EVENT(qi_submit,
__entry->qw0, __entry->qw1, __entry->qw2, __entry->qw3
)
);
+
+TRACE_EVENT(prq_report,
+ TP_PROTO(struct intel_iommu *iommu, struct device *dev,
+ u64 dw0, u64 dw1, u64 dw2, u64 dw3,
+ unsigned long seq),
+
+ TP_ARGS(iommu, dev, dw0, dw1, dw2, dw3, seq),
+
+ TP_STRUCT__entry(
+ __field(u64, dw0)
+ __field(u64, dw1)
+ __field(u64, dw2)
+ __field(u64, dw3)
+ __field(unsigned long, seq)
+ __string(iommu, iommu->name)
+ __string(dev, dev_name(dev))
+ __dynamic_array(char, buff, MSG_MAX)
+ ),
+
+ TP_fast_assign(
+ __entry->dw0 = dw0;
+ __entry->dw1 = dw1;
+ __entry->dw2 = dw2;
+ __entry->dw3 = dw3;
+ __entry->seq = seq;
+ __assign_str(iommu, iommu->name);
+ __assign_str(dev, dev_name(dev));
+ ),
+
+ TP_printk("%s/%s seq# %ld: %s",
+ __get_str(iommu), __get_str(dev), __entry->seq,
+ decode_prq_descriptor(__get_str(buff), MSG_MAX, __entry->dw0,
+ __entry->dw1, __entry->dw2, __entry->dw3)
+ )
+);
#endif /* _TRACE_INTEL_IOMMU_H */
/* This part must be outside protection */
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index ade157b64ce7..d3d028c6a727 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -21,6 +21,7 @@
#include <linux/ioasid.h>
#include <asm/page.h>
#include <asm/fpu/api.h>
+#include <trace/events/intel_iommu.h>
#include "pasid.h"
#include "../iommu-sva-lib.h"
@@ -976,12 +977,18 @@ static irqreturn_t prq_event_thread(int irq, void *d)
goto bad_req;
}
+ sdev->prq_seq_number++;
+
/*
* If prq is to be handled outside iommu driver via receiver of
* the fault notifiers, we skip the page response here.
*/
if (intel_svm_prq_report(sdev->dev, req))
handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
+
+ trace_prq_report(iommu, sdev->dev, req->qw_0, req->qw_1,
+ req->priv_data[0], req->priv_data[1],
+ sdev->prq_seq_number);
prq_advance:
head = (head + sizeof(*req)) & PRQ_RING_MASK;
}
--
2.25.1
The execution time of page fault request handling is performance critical
and needs to be monitored. This adds code to sample the time it takes to
handle each page fault request.
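The sampling scheme can be summarized with the sketch below (illustrative
only, not part of the patch; the helper names are invented, while
dmar_latency_enabled() and dmar_latency_update() are the interfaces added
earlier in this series). In the patch itself the start timestamp is carried
in the otherwise unused prm->private_data[0] field:

#include <linux/ktime.h>
#include <linux/intel-iommu.h>
#include "perf.h"

/* Sketch only: record a start time when a page fault is reported... */
static u64 prq_sample_start(struct intel_iommu *iommu)
{
        if (!dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ))
                return 0;

        return ktime_to_ns(ktime_get());
}

/* ...and account the elapsed time when the page response is issued. */
static void prq_sample_end(struct intel_iommu *iommu, u64 start_ns)
{
        if (start_ns)
                dmar_latency_update(iommu, DMAR_LATENCY_PRQ,
                                    ktime_to_ns(ktime_get()) - start_ns);
}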
Signed-off-by: Lu Baolu <[email protected]>
---
drivers/iommu/intel/svm.c | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index d3d028c6a727..6bff9a7f9133 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -24,6 +24,7 @@
#include <trace/events/intel_iommu.h>
#include "pasid.h"
+#include "perf.h"
#include "../iommu-sva-lib.h"
static irqreturn_t prq_event_thread(int irq, void *d);
@@ -838,8 +839,8 @@ static int prq_to_iommu_prot(struct page_req_dsc *req)
return prot;
}
-static int
-intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
+static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev,
+ struct page_req_dsc *desc)
{
struct iommu_fault_event event;
@@ -871,6 +872,12 @@ intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
memcpy(event.fault.prm.private_data, desc->priv_data,
sizeof(desc->priv_data));
+ } else if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) {
+ /*
+ * If the private data fields are not used by hardware, use them
+ * to monitor the PRQ handling latency.
+ */
+ event.fault.prm.private_data[0] = ktime_to_ns(ktime_get());
}
return iommu_report_device_fault(dev, &event);
@@ -983,7 +990,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
* If prq is to be handled outside iommu driver via receiver of
* the fault notifiers, we skip the page response here.
*/
- if (intel_svm_prq_report(sdev->dev, req))
+ if (intel_svm_prq_report(iommu, sdev->dev, req))
handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
trace_prq_report(iommu, sdev->dev, req->qw_0, req->qw_1,
@@ -1172,6 +1179,9 @@ int intel_svm_page_response(struct device *dev,
if (private_present)
memcpy(&desc.qw2, prm->private_data,
sizeof(prm->private_data));
+ else if (prm->private_data[0])
+ dmar_latency_update(iommu, DMAR_LATENCY_PRQ,
+ ktime_to_ns(ktime_get()) - prm->private_data[0]);
qi_submit_sync(iommu, &desc, 1, 0);
}
--
2.25.1
Queued invalidation execution time is performance critical and needs
to be monitored. This adds code to sample the execution time of IOTLB/
devTLB/IEC cache invalidation.
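For reference, the classification boils down to looking at the type field in
bits 3:0 of the first descriptor word. A helper like the sketch below (the
name is invented for illustration; the patch open-codes the same checks in
qi_submit_sync(), and the DMAR_LATENCY_* indexes come from perf.h added
earlier in this series) maps a descriptor to the latency counter it is
accounted against:

#include <linux/bits.h>
#include <linux/intel-iommu.h>
#include "perf.h"

/*
 * Sketch only: classify a queued invalidation descriptor by the type
 * field in qw0 bits 3:0 and return the matching DMAR_LATENCY_* index,
 * or -1 if the type is not sampled.
 */
static int qi_type_to_latency(u64 qw0)
{
        switch (qw0 & GENMASK_ULL(3, 0)) {
        case QI_IOTLB_TYPE:
        case QI_EIOTLB_TYPE:
                return DMAR_LATENCY_INV_IOTLB;
        case QI_DIOTLB_TYPE:
        case QI_DEIOTLB_TYPE:
                return DMAR_LATENCY_INV_DEVTLB;
        case QI_IEC_TYPE:
                return DMAR_LATENCY_INV_IEC;
        default:
                return -1;
        }
}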
Signed-off-by: Lu Baolu <[email protected]>
---
drivers/iommu/intel/dmar.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 1e31e6799d5c..59ea07d5d70a 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -34,6 +34,7 @@
#include <trace/events/intel_iommu.h>
#include "../irq_remapping.h"
+#include "perf.h"
typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *);
struct dmar_res_callback {
@@ -1340,15 +1341,33 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
unsigned int count, unsigned long options)
{
struct q_inval *qi = iommu->qi;
+ s64 devtlb_start_ktime = 0;
+ s64 iotlb_start_ktime = 0;
+ s64 iec_start_ktime = 0;
struct qi_desc wait_desc;
int wait_index, index;
unsigned long flags;
int offset, shift;
int rc, i;
+ u64 type;
if (!qi)
return 0;
+ type = desc->qw0 & GENMASK_ULL(3, 0);
+
+ if ((type == QI_IOTLB_TYPE || type == QI_EIOTLB_TYPE) &&
+ dmar_latency_enabled(iommu, DMAR_LATENCY_INV_IOTLB))
+ iotlb_start_ktime = ktime_to_ns(ktime_get());
+
+ if ((type == QI_DIOTLB_TYPE || type == QI_DEIOTLB_TYPE) &&
+ dmar_latency_enabled(iommu, DMAR_LATENCY_INV_DEVTLB))
+ devtlb_start_ktime = ktime_to_ns(ktime_get());
+
+ if (type == QI_IEC_TYPE &&
+ dmar_latency_enabled(iommu, DMAR_LATENCY_INV_IEC))
+ iec_start_ktime = ktime_to_ns(ktime_get());
+
restart:
rc = 0;
@@ -1423,6 +1442,18 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
if (rc == -EAGAIN)
goto restart;
+ if (iotlb_start_ktime)
+ dmar_latency_update(iommu, DMAR_LATENCY_INV_IOTLB,
+ ktime_to_ns(ktime_get()) - iotlb_start_ktime);
+
+ if (devtlb_start_ktime)
+ dmar_latency_update(iommu, DMAR_LATENCY_INV_DEVTLB,
+ ktime_to_ns(ktime_get()) - devtlb_start_ktime);
+
+ if (iec_start_ktime)
+ dmar_latency_update(iommu, DMAR_LATENCY_INV_IEC,
+ ktime_to_ns(ktime_get()) - iec_start_ktime);
+
return rc;
}
--
2.25.1
Hi BaoLu,
On Thu, 20 May 2021 11:15:21 +0800, Lu Baolu <[email protected]>
wrote:
> We are about to use iommu_sva_alloc/free_pasid() helpers in iommu core.
> That means the pasid life cycle will be managed by iommu core. Use a
> local array to save the per pasid private data instead of attaching it
> the real pasid.
>
I feel it is a little awkward to have a separate xarray for storing per-IOASID
data. It seems duplicated.
Jason suggested in another thread that we can make ioasid_data public
and embed it in struct intel_svm; then we can get rid of the private data
pointer. ioasid_find() will return the ioasid_data, and we can retrieve the
private data with container_of().
roughly,
struct intel_svm {
...
struct ioasid_data;
};
struct ioasid_data {
ioasid_t id;
refcount_t refs;
struct mm_struct *mm;
};
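For illustration, the lookup could then be something like the sketch below
(member and function names are placeholders, and it assumes ioasid_find() is
changed to return the embedded ioasid_data):

/* Rough sketch only; "pasid_data" is a placeholder member name. */
static struct intel_svm *pasid_to_svm(ioasid_t pasid)
{
        struct ioasid_data *data = ioasid_find(NULL, pasid, NULL);

        if (IS_ERR_OR_NULL(data))
                return NULL;

        return container_of(data, struct intel_svm, pasid_data);
}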
This can be a separate patch/effort if it makes sense to you.
> Signed-off-by: Lu Baolu <[email protected]>
> ---
> drivers/iommu/intel/svm.c | 62 ++++++++++++++++++++++++++-------------
> 1 file changed, 41 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
> index 5165cea90421..82b0627ad7e7 100644
> --- a/drivers/iommu/intel/svm.c
> +++ b/drivers/iommu/intel/svm.c
> @@ -17,6 +17,7 @@
> #include <linux/dmar.h>
> #include <linux/interrupt.h>
> #include <linux/mm_types.h>
> +#include <linux/xarray.h>
> #include <linux/ioasid.h>
> #include <asm/page.h>
> #include <asm/fpu/api.h>
> @@ -28,6 +29,23 @@ static void intel_svm_drain_prq(struct device *dev,
> u32 pasid);
> #define PRQ_ORDER 0
>
> +static DEFINE_XARRAY_ALLOC(pasid_private_array);
> +static int pasid_private_add(ioasid_t pasid, void *priv)
> +{
> + return xa_alloc(&pasid_private_array, &pasid, priv,
> + XA_LIMIT(pasid, pasid), GFP_ATOMIC);
> +}
> +
> +static void pasid_private_remove(ioasid_t pasid)
> +{
> + xa_erase(&pasid_private_array, pasid);
> +}
> +
> +static void *pasid_private_find(ioasid_t pasid)
> +{
> + return xa_load(&pasid_private_array, pasid);
> +}
> +
> int intel_svm_enable_prq(struct intel_iommu *iommu)
> {
> struct page *pages;
> @@ -224,7 +242,7 @@ static int pasid_to_svm_sdev(struct device *dev,
> unsigned int pasid, if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
> return -EINVAL;
>
> - svm = ioasid_find(NULL, pasid, NULL);
> + svm = pasid_private_find(pasid);
> if (IS_ERR(svm))
> return PTR_ERR(svm);
>
> @@ -334,7 +352,7 @@ int intel_svm_bind_gpasid(struct iommu_domain
> *domain, struct device *dev, svm->gpasid = data->gpasid;
> svm->flags |= SVM_FLAG_GUEST_PASID;
> }
> - ioasid_set_data(data->hpasid, svm);
> + pasid_private_add(data->hpasid, svm);
> INIT_LIST_HEAD_RCU(&svm->devs);
> mmput(svm->mm);
> }
> @@ -388,7 +406,7 @@ int intel_svm_bind_gpasid(struct iommu_domain
> *domain, struct device *dev, list_add_rcu(&sdev->list, &svm->devs);
> out:
> if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
> - ioasid_set_data(data->hpasid, NULL);
> + pasid_private_remove(data->hpasid);
> kfree(svm);
> }
>
> @@ -431,7 +449,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32
> pasid)
> * the unbind, IOMMU driver will get
> notified
> * and perform cleanup.
> */
> - ioasid_set_data(pasid, NULL);
> + pasid_private_remove(pasid);
> kfree(svm);
> }
> }
> @@ -547,8 +565,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int
> flags, svm = kzalloc(sizeof(*svm), GFP_KERNEL);
> if (!svm) {
> ret = -ENOMEM;
> - kfree(sdev);
> - goto out;
> + goto sdev_err;
> }
>
> if (pasid_max > intel_pasid_max_id)
> @@ -556,13 +573,16 @@ intel_svm_bind_mm(struct device *dev, unsigned int
> flags,
> /* Do not use PASID 0, reserved for RID to PASID */
> svm->pasid = ioasid_alloc(NULL, PASID_MIN,
> - pasid_max - 1, svm);
> + pasid_max - 1, NULL);
> if (svm->pasid == INVALID_IOASID) {
> - kfree(svm);
> - kfree(sdev);
> ret = -ENOSPC;
> - goto out;
> + goto svm_err;
> }
> +
> + ret = pasid_private_add(svm->pasid, svm);
> + if (ret)
> + goto pasid_err;
> +
> svm->notifier.ops = &intel_mmuops;
> svm->mm = mm;
> svm->flags = flags;
> @@ -571,12 +591,8 @@ intel_svm_bind_mm(struct device *dev, unsigned int
> flags, ret = -ENOMEM;
> if (mm) {
> ret = mmu_notifier_register(&svm->notifier, mm);
> - if (ret) {
> - ioasid_put(svm->pasid);
> - kfree(svm);
> - kfree(sdev);
> - goto out;
> - }
> + if (ret)
> + goto priv_err;
> }
>
> spin_lock_irqsave(&iommu->lock, iflags);
> @@ -590,8 +606,13 @@ intel_svm_bind_mm(struct device *dev, unsigned int
> flags, if (ret) {
> if (mm)
> mmu_notifier_unregister(&svm->notifier,
> mm); +priv_err:
> + pasid_private_remove(svm->pasid);
> +pasid_err:
> ioasid_put(svm->pasid);
> +svm_err:
> kfree(svm);
> +sdev_err:
> kfree(sdev);
> goto out;
> }
> @@ -614,10 +635,8 @@ intel_svm_bind_mm(struct device *dev, unsigned int
> flags, (cpu_feature_enabled(X86_FEATURE_LA57) ?
> PASID_FLAG_FL5LP : 0));
> spin_unlock_irqrestore(&iommu->lock, iflags);
> - if (ret) {
> - kfree(sdev);
> - goto out;
> - }
> + if (ret)
> + goto sdev_err;
> }
> list_add_rcu(&sdev->list, &svm->devs);
> success:
> @@ -670,6 +689,7 @@ static int intel_svm_unbind_mm(struct device *dev,
> u32 pasid) load_pasid(svm->mm, PASID_DISABLED);
> }
> list_del(&svm->list);
> + pasid_private_remove(svm->pasid);
> /* We mandate that no page faults may be
> outstanding
> * for the PASID when
> intel_svm_unbind_mm() is called.
> * If that is not obeyed, subtle errors
> will happen. @@ -924,7 +944,7 @@ static irqreturn_t prq_event_thread(int
> irq, void *d) }
> if (!svm || svm->pasid != req->pasid) {
> rcu_read_lock();
> - svm = ioasid_find(NULL, req->pasid, NULL);
> + svm = pasid_private_find(req->pasid);
> /* It *can't* go away, because the driver is not
> permitted
> * to unbind the mm while any page faults are
> outstanding.
> * So we only need RCU to protect the internal
> idr code. */
Thanks,
Jacob
Hi Jacob,
Thanks for reviewing my patch.
On 5/22/21 5:25 AM, Jacob Pan wrote:
> Hi BaoLu,
>
> On Thu, 20 May 2021 11:15:21 +0800, Lu Baolu <[email protected]>
> wrote:
>
>> We are about to use iommu_sva_alloc/free_pasid() helpers in iommu core.
>> That means the pasid life cycle will be managed by iommu core. Use a
>> local array to save the per pasid private data instead of attaching it
>> the real pasid.
>>
> I feel a little awkward to have a separate xarray for storing per IOASID
> data. Seems duplicated.
> Jason suggested in another thread that we can make ioasid_data public
> and embeded in struct intel_svm, then we can get rid of the private data
> pointer. ioasid_find will return the ioasid_data, then we can retrieve the
> private data with container_of.
The problem that this patch wants to solve is that
iommu_sva_alloc_pasid() attaches the mm pointer to the sva pasid as its
private data:
pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
Assuming that each sva pasid can have only a single private data
pointer, the vendor iommu driver shouldn't set the private data again.
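A rough sketch of the clash, just to spell it out (hypothetical code, not
something being proposed):

#include <linux/ioasid.h>
#include <linux/mm_types.h>

static void sketch_private_data_clash(struct ioasid_set *set,
                                      ioasid_t min, ioasid_t max,
                                      struct mm_struct *mm, void *driver_data)
{
        ioasid_t pasid;

        /* The sva core keeps @mm as the pasid private data... */
        pasid = ioasid_alloc(set, min, max, mm);
        if (pasid == INVALID_IOASID)
                return;

        /* ...so a second ioasid_set_data() would silently replace it. */
        ioasid_set_data(pasid, driver_data);
}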
>
> roughly,
>
> struct intel_svm {
> ...
> struct ioasid_data;
> };
>
> struct ioasid_data {
> ioasid_t id;
> refcount_t refs;
> struct mm_struct *mm;
> };
>
> This can be a separate patch/effort if it make sense to you.
Yes, if we have a better solution.
Best regards,
baolu
Hi Lu,
On Mon, 24 May 2021 10:16:18 +0800, Lu Baolu <[email protected]>
wrote:
> Hi Jacob,
>
> Thanks for reviewing my patch.
>
> On 5/22/21 5:25 AM, Jacob Pan wrote:
> > Hi BaoLu,
> >
> > On Thu, 20 May 2021 11:15:21 +0800, Lu Baolu <[email protected]>
> > wrote:
> >
> >> We are about to use iommu_sva_alloc/free_pasid() helpers in iommu core.
> >> That means the pasid life cycle will be managed by iommu core. Use a
> >> local array to save the per pasid private data instead of attaching it
> >> the real pasid.
> >>
> > I feel a little awkward to have a separate xarray for storing per IOASID
> > data. Seems duplicated.
> > Jason suggested in another thread that we can make ioasid_data public
> > and embeded in struct intel_svm, then we can get rid of the private data
> > pointer. ioasid_find will return the ioasid_data, then we can retrieve
> > the private data with container_of.
>
> The problem that this patch wants to solve is that the
> iommu_sva_alloc_pasid() will attach the mm pointer to the sva pasid.
>
> pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
>
> Assuming that each sva pasid can have only a single private data
> pointer, the vendor iommu driver shouldn't set the private data again.
>
You are right. I got confused with vSVM, where the guest will have the
private data assigned after the bind.
> >
> > roughly,
> >
> > struct intel_svm {
> > ...
> > struct ioasid_data;
> > };
> >
> > struct ioasid_data {
> > ioasid_t id;
> > refcount_t refs;
> > struct mm_struct *mm;
> > };
> >
> > This can be a separate patch/effort if it make sense to you.
>
> Yes if we have a better solution.
>
Will be part of the IOASID core change.
Thanks,
> Best regards,
> baolu
Thanks,
Jacob