PASIDs are process wide. It was attempted to use refcounted PASIDs to
free them when the last thread drops the refcount. This turned out to
be complex and error prone. Given the fact that the PASID space is 20
bits, which allows up to 1M processes to have a PASID associated
concurrently, PASID resource exhaustion is not a realistic concern.
Therefore it was decided to simplify the approach and stick with lazy
on demand PASID allocation, but drop the eager free approach and make
a allocated PASID lifetime bound to the life time of the process.
Get rid of the refcounting mechanisms and replace/rename the interfaces
to reflect this new approach.
Suggested-by: Dave Hansen <[email protected]>
Signed-off-by: Fenghua Yu <[email protected]>
Reviewed-by: Tony Luck <[email protected]>
---
v4:
- Update the commit message (Thomas).
v3:
- Rename mm_pasid_get() to mm_pasid_set() (Thomas).
- Remove ioasid_get() because it's not used any more when the IOASID
is freed on mm exit (Thomas).
- Remove PASID's refcount exercise in ioasid_put() and rename
ioasid_put() to ioasid_free() (Thomas).
v2:
- Free PASID on mm exit instead of in exit(2) or unbind() (Thomas, AndyL,
PeterZ)
- Add mm_pasid_init(), mm_pasid_get(), and mm_pasid_drop() functions in mm.
So the mm's PASID operations are generic for both X86 and ARM
(Dave Hansen)
.../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 5 +--
drivers/iommu/intel/iommu.c | 4 +-
drivers/iommu/intel/svm.c | 9 -----
drivers/iommu/ioasid.c | 39 ++-----------------
drivers/iommu/iommu-sva-lib.c | 39 ++++++-------------
drivers/iommu/iommu-sva-lib.h | 1 -
include/linux/ioasid.h | 12 +-----
include/linux/sched/mm.h | 16 ++++++++
kernel/fork.c | 1 +
9 files changed, 38 insertions(+), 88 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index a737ba5f727e..22ddd05bbdcd 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -340,14 +340,12 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm)
bond->smmu_mn = arm_smmu_mmu_notifier_get(smmu_domain, mm);
if (IS_ERR(bond->smmu_mn)) {
ret = PTR_ERR(bond->smmu_mn);
- goto err_free_pasid;
+ goto err_free_bond;
}
list_add(&bond->list, &master->bonds);
return &bond->sva;
-err_free_pasid:
- iommu_sva_free_pasid(mm);
err_free_bond:
kfree(bond);
return ERR_PTR(ret);
@@ -377,7 +375,6 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle)
if (refcount_dec_and_test(&bond->refs)) {
list_del(&bond->list);
arm_smmu_mmu_notifier_put(bond->smmu_mn);
- iommu_sva_free_pasid(bond->mm);
kfree(bond);
}
mutex_unlock(&sva_lock);
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 92fea3fbbb11..ef03b2176bbd 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4781,7 +4781,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain,
link_failed:
spin_unlock_irqrestore(&device_domain_lock, flags);
if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
- ioasid_put(domain->default_pasid);
+ ioasid_free(domain->default_pasid);
return ret;
}
@@ -4811,7 +4811,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain,
spin_unlock_irqrestore(&device_domain_lock, flags);
if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
- ioasid_put(domain->default_pasid);
+ ioasid_free(domain->default_pasid);
}
static int prepare_domain_attach_device(struct iommu_domain *domain,
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 5b5d69b04fcc..51ac2096b3da 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -514,11 +514,6 @@ static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm,
return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1);
}
-static void intel_svm_free_pasid(struct mm_struct *mm)
-{
- iommu_sva_free_pasid(mm);
-}
-
static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
struct device *dev,
struct mm_struct *mm,
@@ -662,8 +657,6 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
kfree(svm);
}
}
- /* Drop a PASID reference and free it if no reference. */
- intel_svm_free_pasid(mm);
}
out:
return ret;
@@ -1047,8 +1040,6 @@ struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void
}
sva = intel_svm_bind_mm(iommu, dev, mm, flags);
- if (IS_ERR_OR_NULL(sva))
- intel_svm_free_pasid(mm);
mutex_unlock(&pasid_mutex);
return sva;
diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 06fee7416816..a786c034907c 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -2,7 +2,7 @@
/*
* I/O Address Space ID allocator. There is one global IOASID space, split into
* subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and
- * free IOASIDs with ioasid_alloc and ioasid_put.
+ * free IOASIDs with ioasid_alloc() and ioasid_free().
*/
#include <linux/ioasid.h>
#include <linux/module.h>
@@ -15,7 +15,6 @@ struct ioasid_data {
struct ioasid_set *set;
void *private;
struct rcu_head rcu;
- refcount_t refs;
};
/*
@@ -315,7 +314,6 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
data->set = set;
data->private = private;
- refcount_set(&data->refs, 1);
/*
* Custom allocator needs allocator data to perform platform specific
@@ -348,35 +346,11 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
EXPORT_SYMBOL_GPL(ioasid_alloc);
/**
- * ioasid_get - obtain a reference to the IOASID
- * @ioasid: the ID to get
- */
-void ioasid_get(ioasid_t ioasid)
-{
- struct ioasid_data *ioasid_data;
-
- spin_lock(&ioasid_allocator_lock);
- ioasid_data = xa_load(&active_allocator->xa, ioasid);
- if (ioasid_data)
- refcount_inc(&ioasid_data->refs);
- else
- WARN_ON(1);
- spin_unlock(&ioasid_allocator_lock);
-}
-EXPORT_SYMBOL_GPL(ioasid_get);
-
-/**
- * ioasid_put - Release a reference to an ioasid
+ * ioasid_free - Free an ioasid
* @ioasid: the ID to remove
- *
- * Put a reference to the IOASID, free it when the number of references drops to
- * zero.
- *
- * Return: %true if the IOASID was freed, %false otherwise.
*/
-bool ioasid_put(ioasid_t ioasid)
+void ioasid_free(ioasid_t ioasid)
{
- bool free = false;
struct ioasid_data *ioasid_data;
spin_lock(&ioasid_allocator_lock);
@@ -386,10 +360,6 @@ bool ioasid_put(ioasid_t ioasid)
goto exit_unlock;
}
- free = refcount_dec_and_test(&ioasid_data->refs);
- if (!free)
- goto exit_unlock;
-
active_allocator->ops->free(ioasid, active_allocator->ops->pdata);
/* Custom allocator needs additional steps to free the xa element */
if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) {
@@ -399,9 +369,8 @@ bool ioasid_put(ioasid_t ioasid)
exit_unlock:
spin_unlock(&ioasid_allocator_lock);
- return free;
}
-EXPORT_SYMBOL_GPL(ioasid_put);
+EXPORT_SYMBOL_GPL(ioasid_free);
/**
* ioasid_find - Find IOASID data
diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c
index bd41405d34e9..106506143896 100644
--- a/drivers/iommu/iommu-sva-lib.c
+++ b/drivers/iommu/iommu-sva-lib.c
@@ -18,8 +18,7 @@ static DECLARE_IOASID_SET(iommu_sva_pasid);
*
* Try to allocate a PASID for this mm, or take a reference to the existing one
* provided it fits within the [@min, @max] range. On success the PASID is
- * available in mm->pasid, and must be released with iommu_sva_free_pasid().
- * @min must be greater than 0, because 0 indicates an unused mm->pasid.
+ * available in mm->pasid and will be available for the lifetime of the mm.
*
* Returns 0 on success and < 0 on error.
*/
@@ -33,38 +32,24 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max)
return -EINVAL;
mutex_lock(&iommu_sva_lock);
- if (mm->pasid) {
- if (mm->pasid >= min && mm->pasid <= max)
- ioasid_get(mm->pasid);
- else
+ /* Is a PASID already associated with this mm? */
+ if (pasid_valid(mm->pasid)) {
+ if (mm->pasid < min || mm->pasid >= max)
ret = -EOVERFLOW;
- } else {
- pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
- if (pasid == INVALID_IOASID)
- ret = -ENOMEM;
- else
- mm->pasid = pasid;
+ goto out;
}
+
+ pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
+ if (!pasid_valid(pasid))
+ ret = -ENOMEM;
+ else
+ mm_pasid_set(mm, pasid);
+out:
mutex_unlock(&iommu_sva_lock);
return ret;
}
EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid);
-/**
- * iommu_sva_free_pasid - Release the mm's PASID
- * @mm: the mm
- *
- * Drop one reference to a PASID allocated with iommu_sva_alloc_pasid()
- */
-void iommu_sva_free_pasid(struct mm_struct *mm)
-{
- mutex_lock(&iommu_sva_lock);
- if (ioasid_put(mm->pasid))
- mm->pasid = 0;
- mutex_unlock(&iommu_sva_lock);
-}
-EXPORT_SYMBOL_GPL(iommu_sva_free_pasid);
-
/* ioasid_find getter() requires a void * argument */
static bool __mmget_not_zero(void *mm)
{
diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva-lib.h
index 95dc3ebc1928..8909ea1094e3 100644
--- a/drivers/iommu/iommu-sva-lib.h
+++ b/drivers/iommu/iommu-sva-lib.h
@@ -9,7 +9,6 @@
#include <linux/mm_types.h>
int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max);
-void iommu_sva_free_pasid(struct mm_struct *mm);
struct mm_struct *iommu_sva_find(ioasid_t pasid);
/* I/O Page fault */
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index 2237f64dbaae..af1c9d62e642 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -34,8 +34,7 @@ struct ioasid_allocator_ops {
#if IS_ENABLED(CONFIG_IOASID)
ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
void *private);
-void ioasid_get(ioasid_t ioasid);
-bool ioasid_put(ioasid_t ioasid);
+void ioasid_free(ioasid_t ioasid);
void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
bool (*getter)(void *));
int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
@@ -53,14 +52,7 @@ static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
return INVALID_IOASID;
}
-static inline void ioasid_get(ioasid_t ioasid)
-{
-}
-
-static inline bool ioasid_put(ioasid_t ioasid)
-{
- return false;
-}
+static inline void ioasid_free(ioasid_t ioasid) { }
static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
bool (*getter)(void *))
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c74d1edbac2f..a80356e9dc69 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -439,8 +439,24 @@ static inline void mm_pasid_init(struct mm_struct *mm)
{
mm->pasid = INVALID_IOASID;
}
+
+/* Associate a PASID with an mm_struct: */
+static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid)
+{
+ mm->pasid = pasid;
+}
+
+static inline void mm_pasid_drop(struct mm_struct *mm)
+{
+ if (pasid_valid(mm->pasid)) {
+ ioasid_free(mm->pasid);
+ mm->pasid = INVALID_IOASID;
+ }
+}
#else
static inline void mm_pasid_init(struct mm_struct *mm) {}
+static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) {}
+static inline void mm_pasid_drop(struct mm_struct *mm) {}
#endif
#endif /* _LINUX_SCHED_MM_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index deacd2c17a7f..c03c6682464c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1115,6 +1115,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ mm_pasid_drop(mm);
mmdrop(mm);
}
--
2.35.1
On 2/8/22 7:02 AM, Fenghua Yu wrote:
> PASIDs are process wide. It was attempted to use refcounted PASIDs to
> free them when the last thread drops the refcount. This turned out to
> be complex and error prone. Given the fact that the PASID space is 20
> bits, which allows up to 1M processes to have a PASID associated
> concurrently, PASID resource exhaustion is not a realistic concern.
>
> Therefore it was decided to simplify the approach and stick with lazy
> on demand PASID allocation, but drop the eager free approach and make
> a allocated PASID lifetime bound to the life time of the process.
>
> Get rid of the refcounting mechanisms and replace/rename the interfaces
> to reflect this new approach.
>
> Suggested-by: Dave Hansen <[email protected]>
> Signed-off-by: Fenghua Yu <[email protected]>
> Reviewed-by: Tony Luck <[email protected]>
> ---
> v4:
> - Update the commit message (Thomas).
>
> v3:
> - Rename mm_pasid_get() to mm_pasid_set() (Thomas).
> - Remove ioasid_get() because it's not used any more when the IOASID
> is freed on mm exit (Thomas).
> - Remove PASID's refcount exercise in ioasid_put() and rename
> ioasid_put() to ioasid_free() (Thomas).
>
> v2:
> - Free PASID on mm exit instead of in exit(2) or unbind() (Thomas, AndyL,
> PeterZ)
> - Add mm_pasid_init(), mm_pasid_get(), and mm_pasid_drop() functions in mm.
> So the mm's PASID operations are generic for both X86 and ARM
> (Dave Hansen)
>
> .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 5 +--
> drivers/iommu/intel/iommu.c | 4 +-
> drivers/iommu/intel/svm.c | 9 -----
> drivers/iommu/ioasid.c | 39 ++-----------------
> drivers/iommu/iommu-sva-lib.c | 39 ++++++-------------
> drivers/iommu/iommu-sva-lib.h | 1 -
> include/linux/ioasid.h | 12 +-----
> include/linux/sched/mm.h | 16 ++++++++
> kernel/fork.c | 1 +
> 9 files changed, 38 insertions(+), 88 deletions(-)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> index a737ba5f727e..22ddd05bbdcd 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> @@ -340,14 +340,12 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm)
> bond->smmu_mn = arm_smmu_mmu_notifier_get(smmu_domain, mm);
> if (IS_ERR(bond->smmu_mn)) {
> ret = PTR_ERR(bond->smmu_mn);
> - goto err_free_pasid;
> + goto err_free_bond;
> }
>
> list_add(&bond->list, &master->bonds);
> return &bond->sva;
>
> -err_free_pasid:
> - iommu_sva_free_pasid(mm);
> err_free_bond:
> kfree(bond);
> return ERR_PTR(ret);
> @@ -377,7 +375,6 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle)
> if (refcount_dec_and_test(&bond->refs)) {
> list_del(&bond->list);
> arm_smmu_mmu_notifier_put(bond->smmu_mn);
> - iommu_sva_free_pasid(bond->mm);
> kfree(bond);
> }
> mutex_unlock(&sva_lock);
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index 92fea3fbbb11..ef03b2176bbd 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -4781,7 +4781,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain,
> link_failed:
> spin_unlock_irqrestore(&device_domain_lock, flags);
> if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
> - ioasid_put(domain->default_pasid);
> + ioasid_free(domain->default_pasid);
>
> return ret;
> }
> @@ -4811,7 +4811,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain,
> spin_unlock_irqrestore(&device_domain_lock, flags);
>
> if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
> - ioasid_put(domain->default_pasid);
> + ioasid_free(domain->default_pasid);
> }
>
> static int prepare_domain_attach_device(struct iommu_domain *domain,
> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
> index 5b5d69b04fcc..51ac2096b3da 100644
> --- a/drivers/iommu/intel/svm.c
> +++ b/drivers/iommu/intel/svm.c
> @@ -514,11 +514,6 @@ static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm,
> return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1);
> }
>
> -static void intel_svm_free_pasid(struct mm_struct *mm)
> -{
> - iommu_sva_free_pasid(mm);
> -}
> -
> static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
> struct device *dev,
> struct mm_struct *mm,
> @@ -662,8 +657,6 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
> kfree(svm);
> }
> }
> - /* Drop a PASID reference and free it if no reference. */
> - intel_svm_free_pasid(mm);
> }
> out:
> return ret;
> @@ -1047,8 +1040,6 @@ struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void
> }
>
> sva = intel_svm_bind_mm(iommu, dev, mm, flags);
> - if (IS_ERR_OR_NULL(sva))
> - intel_svm_free_pasid(mm);
> mutex_unlock(&pasid_mutex);
>
> return sva;
> diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
> index 06fee7416816..a786c034907c 100644
> --- a/drivers/iommu/ioasid.c
> +++ b/drivers/iommu/ioasid.c
> @@ -2,7 +2,7 @@
> /*
> * I/O Address Space ID allocator. There is one global IOASID space, split into
> * subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and
> - * free IOASIDs with ioasid_alloc and ioasid_put.
> + * free IOASIDs with ioasid_alloc() and ioasid_free().
> */
> #include <linux/ioasid.h>
> #include <linux/module.h>
> @@ -15,7 +15,6 @@ struct ioasid_data {
> struct ioasid_set *set;
> void *private;
> struct rcu_head rcu;
> - refcount_t refs;
> };
>
> /*
> @@ -315,7 +314,6 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
>
> data->set = set;
> data->private = private;
> - refcount_set(&data->refs, 1);
>
> /*
> * Custom allocator needs allocator data to perform platform specific
> @@ -348,35 +346,11 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
> EXPORT_SYMBOL_GPL(ioasid_alloc);
>
> /**
> - * ioasid_get - obtain a reference to the IOASID
> - * @ioasid: the ID to get
> - */
> -void ioasid_get(ioasid_t ioasid)
> -{
> - struct ioasid_data *ioasid_data;
> -
> - spin_lock(&ioasid_allocator_lock);
> - ioasid_data = xa_load(&active_allocator->xa, ioasid);
> - if (ioasid_data)
> - refcount_inc(&ioasid_data->refs);
> - else
> - WARN_ON(1);
> - spin_unlock(&ioasid_allocator_lock);
> -}
> -EXPORT_SYMBOL_GPL(ioasid_get);
> -
> -/**
> - * ioasid_put - Release a reference to an ioasid
> + * ioasid_free - Free an ioasid
> * @ioasid: the ID to remove
> - *
> - * Put a reference to the IOASID, free it when the number of references drops to
> - * zero.
> - *
> - * Return: %true if the IOASID was freed, %false otherwise.
> */
> -bool ioasid_put(ioasid_t ioasid)
> +void ioasid_free(ioasid_t ioasid)
> {
> - bool free = false;
> struct ioasid_data *ioasid_data;
>
> spin_lock(&ioasid_allocator_lock);
> @@ -386,10 +360,6 @@ bool ioasid_put(ioasid_t ioasid)
> goto exit_unlock;
> }
>
> - free = refcount_dec_and_test(&ioasid_data->refs);
> - if (!free)
> - goto exit_unlock;
> -
> active_allocator->ops->free(ioasid, active_allocator->ops->pdata);
> /* Custom allocator needs additional steps to free the xa element */
> if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) {
> @@ -399,9 +369,8 @@ bool ioasid_put(ioasid_t ioasid)
>
> exit_unlock:
> spin_unlock(&ioasid_allocator_lock);
> - return free;
> }
> -EXPORT_SYMBOL_GPL(ioasid_put);
> +EXPORT_SYMBOL_GPL(ioasid_free);
>
> /**
> * ioasid_find - Find IOASID data
> diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c
> index bd41405d34e9..106506143896 100644
> --- a/drivers/iommu/iommu-sva-lib.c
> +++ b/drivers/iommu/iommu-sva-lib.c
> @@ -18,8 +18,7 @@ static DECLARE_IOASID_SET(iommu_sva_pasid);
> *
> * Try to allocate a PASID for this mm, or take a reference to the existing one
> * provided it fits within the [@min, @max] range. On success the PASID is
> - * available in mm->pasid, and must be released with iommu_sva_free_pasid().
> - * @min must be greater than 0, because 0 indicates an unused mm->pasid.
> + * available in mm->pasid and will be available for the lifetime of the mm.
> *
> * Returns 0 on success and < 0 on error.
> */
> @@ -33,38 +32,24 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max)
> return -EINVAL;
>
> mutex_lock(&iommu_sva_lock);
> - if (mm->pasid) {
> - if (mm->pasid >= min && mm->pasid <= max)
> - ioasid_get(mm->pasid);
> - else
> + /* Is a PASID already associated with this mm? */
> + if (pasid_valid(mm->pasid)) {
> + if (mm->pasid < min || mm->pasid >= max)
> ret = -EOVERFLOW;
> - } else {
> - pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
> - if (pasid == INVALID_IOASID)
> - ret = -ENOMEM;
> - else
> - mm->pasid = pasid;
> + goto out;
> }
> +
> + pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
> + if (!pasid_valid(pasid))
> + ret = -ENOMEM;
> + else
> + mm_pasid_set(mm, pasid);
> +out:
> mutex_unlock(&iommu_sva_lock);
> return ret;
> }
> EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid);
>
> -/**
> - * iommu_sva_free_pasid - Release the mm's PASID
> - * @mm: the mm
> - *
> - * Drop one reference to a PASID allocated with iommu_sva_alloc_pasid()
> - */
> -void iommu_sva_free_pasid(struct mm_struct *mm)
> -{
> - mutex_lock(&iommu_sva_lock);
> - if (ioasid_put(mm->pasid))
> - mm->pasid = 0;
> - mutex_unlock(&iommu_sva_lock);
> -}
> -EXPORT_SYMBOL_GPL(iommu_sva_free_pasid);
> -
> /* ioasid_find getter() requires a void * argument */
> static bool __mmget_not_zero(void *mm)
> {
> diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva-lib.h
> index 95dc3ebc1928..8909ea1094e3 100644
> --- a/drivers/iommu/iommu-sva-lib.h
> +++ b/drivers/iommu/iommu-sva-lib.h
> @@ -9,7 +9,6 @@
> #include <linux/mm_types.h>
>
> int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max);
> -void iommu_sva_free_pasid(struct mm_struct *mm);
> struct mm_struct *iommu_sva_find(ioasid_t pasid);
>
> /* I/O Page fault */
> diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
> index 2237f64dbaae..af1c9d62e642 100644
> --- a/include/linux/ioasid.h
> +++ b/include/linux/ioasid.h
> @@ -34,8 +34,7 @@ struct ioasid_allocator_ops {
> #if IS_ENABLED(CONFIG_IOASID)
> ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
> void *private);
> -void ioasid_get(ioasid_t ioasid);
> -bool ioasid_put(ioasid_t ioasid);
> +void ioasid_free(ioasid_t ioasid);
> void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
> bool (*getter)(void *));
> int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
> @@ -53,14 +52,7 @@ static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
> return INVALID_IOASID;
> }
>
> -static inline void ioasid_get(ioasid_t ioasid)
> -{
> -}
> -
> -static inline bool ioasid_put(ioasid_t ioasid)
> -{
> - return false;
> -}
> +static inline void ioasid_free(ioasid_t ioasid) { }
>
> static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
> bool (*getter)(void *))
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index c74d1edbac2f..a80356e9dc69 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -439,8 +439,24 @@ static inline void mm_pasid_init(struct mm_struct *mm)
> {
> mm->pasid = INVALID_IOASID;
> }
> +
> +/* Associate a PASID with an mm_struct: */
> +static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid)
> +{
> + mm->pasid = pasid;
> +}
> +
> +static inline void mm_pasid_drop(struct mm_struct *mm)
> +{
> + if (pasid_valid(mm->pasid)) {
> + ioasid_free(mm->pasid);
> + mm->pasid = INVALID_IOASID;
> + }
> +}
> #else
> static inline void mm_pasid_init(struct mm_struct *mm) {}
> +static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) {}
> +static inline void mm_pasid_drop(struct mm_struct *mm) {}
> #endif
>
> #endif /* _LINUX_SCHED_MM_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index deacd2c17a7f..c03c6682464c 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1115,6 +1115,7 @@ static inline void __mmput(struct mm_struct *mm)
> }
> if (mm->binfmt)
> module_put(mm->binfmt->module);
> + mm_pasid_drop(mm);
> mmdrop(mm);
> }
>
Reviewed-by: Lu Baolu <[email protected]>
Best regards,
baolu
Hi Fenghua,
On Mon, 7 Feb 2022 15:02:48 -0800, Fenghua Yu <[email protected]> wrote:
> @@ -1047,8 +1040,6 @@ struct iommu_sva *intel_svm_bind(struct device
> *dev, struct mm_struct *mm, void }
>
> sva = intel_svm_bind_mm(iommu, dev, mm, flags);
> - if (IS_ERR_OR_NULL(sva))
> - intel_svm_free_pasid(mm);
If bind fails, the PASID has no IOMMU nor CPU context. It should be safe to
free here.
Thanks,
Jacob
On Thu, Feb 10, 2022 at 08:27:50AM -0800, Fenghua Yu wrote:
> Hi, Jacob,
>
> On Wed, Feb 09, 2022 at 07:16:14PM -0800, Jacob Pan wrote:
> > Hi Fenghua,
> >
> > On Mon, 7 Feb 2022 15:02:48 -0800, Fenghua Yu <[email protected]> wrote:
> >
> > > @@ -1047,8 +1040,6 @@ struct iommu_sva *intel_svm_bind(struct device
> > > *dev, struct mm_struct *mm, void }
> > >
> > > sva = intel_svm_bind_mm(iommu, dev, mm, flags);
> > > - if (IS_ERR_OR_NULL(sva))
> > > - intel_svm_free_pasid(mm);
> > If bind fails, the PASID has no IOMMU nor CPU context. It should be safe to
> > free here.
>
> The PASID can not be freed even if bind fails. The PASID allocated earlier
> (either in this thread or in another thread) might be populated to other
> threads already and being used now.
>
> Without freeing the PASID on bind failure, the worst case is the PASID might
> not be used in the process (and will be freed on process exit anyway).
>
> This all matches with the PASID life time described in the commit message.
But what does this mean for the user that failed that intel_svm_bind_mm()?
Here's a scenario:
Process sets up to use PASID capable device #1. Everything works,
so the process has mm->pasid, and the IOMMU has the tables to map
virtual addresses coming from device #1 using that PASID.
Now the same process asks to start using PASID capable device #2,
but there is a failure at intel_svm_bind_mm().
Fenghua is right that we shouldn't free the PASID. It is in use
by at least one thread of the process to access device #1.
But what happens with device #2? Does the caller of intel_svm_bind()
do the right thing with the IS_ERR_OR_NULL return value to let the
user know that device #2 isn't accessible?
-Tony
Hi, Tony,
On Thu, Feb 10, 2022 at 09:24:50AM -0800, Luck, Tony wrote:
> On Thu, Feb 10, 2022 at 08:27:50AM -0800, Fenghua Yu wrote:
> > Hi, Jacob,
> >
> > On Wed, Feb 09, 2022 at 07:16:14PM -0800, Jacob Pan wrote:
> > > Hi Fenghua,
> > >
> > > On Mon, 7 Feb 2022 15:02:48 -0800, Fenghua Yu <[email protected]> wrote:
> > >
> > > > @@ -1047,8 +1040,6 @@ struct iommu_sva *intel_svm_bind(struct device
> > > > *dev, struct mm_struct *mm, void }
> > > >
> > > > sva = intel_svm_bind_mm(iommu, dev, mm, flags);
> > > > - if (IS_ERR_OR_NULL(sva))
> > > > - intel_svm_free_pasid(mm);
> > > If bind fails, the PASID has no IOMMU nor CPU context. It should be safe to
> > > free here.
> >
> > The PASID can not be freed even if bind fails. The PASID allocated earlier
> > (either in this thread or in another thread) might be populated to other
> > threads already and being used now.
> >
> > Without freeing the PASID on bind failure, the worst case is the PASID might
> > not be used in the process (and will be freed on process exit anyway).
> >
> > This all matches with the PASID life time described in the commit message.
>
> But what does this mean for the user that failed that intel_svm_bind_mm()?
>
That means the user may have a PASID but there is no PASID entry for the
device. Then ENQCMD cannot be executed successfully.
> Here's a scenario:
>
> Process sets up to use PASID capable device #1. Everything works,
> so the process has mm->pasid, and the IOMMU has the tables to map
> virtual addresses coming from device #1 using that PASID.
>
> Now the same process asks to start using PASID capable device #2,
> but there is a failure at intel_svm_bind_mm().
>
> Fenghua is right that we shouldn't free the PASID. It is in use
> by at least one thread of the process to access device #1.
>
> But what happens with device #2? Does the caller of intel_svm_bind()
> do the right thing with the IS_ERR_OR_NULL return value to let the
> user know that device #2 isn't accessible?
A driver caller of intel_svm_bind() should handle this failure, i.e. fail
the binding and let the user know the failure.
Even if the driver doesn't do the right thing to handle the failure,
intel_svm_bind() doesn't set up a PASID entry for device #2.
One example is IDXD driver. User calls open()->IDXD driver idxd_cdev_open()
->intel_svm_bind()->intel_svm_bind_mm(). idxd_cdev_open() gets failed "sva"
and passes the PTR_ERR(sva) to the user and the user cannot open the device.
Due to the failure, no PASID entry is set up for the device.
Even if the user ignores the open() failure and tries to run ENQCMD on
device #2, a PASID table fault will be generated due to no PASID entry
for the device and the ENQCMD execution will fail.
Thanks.
-Fenghua
On Wed, 9 Feb 2022 19:16:14 -0800, Jacob Pan
<[email protected]> wrote:
> Hi Fenghua,
>
> On Mon, 7 Feb 2022 15:02:48 -0800, Fenghua Yu <[email protected]>
> wrote:
>
> > @@ -1047,8 +1040,6 @@ struct iommu_sva *intel_svm_bind(struct device
> > *dev, struct mm_struct *mm, void }
> >
> > sva = intel_svm_bind_mm(iommu, dev, mm, flags);
> > - if (IS_ERR_OR_NULL(sva))
> > - intel_svm_free_pasid(mm);
> If bind fails, the PASID has no IOMMU nor CPU context. It should be safe
> to free here.
>
Actually, here we cannot tell if the bind is the first of the mm so I think
this is fine.
Reviewed-by: Jacob Pan <[email protected]>
> Thanks,
>
> Jacob
Thanks,
Jacob
On Thu, Feb 10, 2022 at 10:49:04AM -0800, Jacob Pan wrote:
>
> On Wed, 9 Feb 2022 19:16:14 -0800, Jacob Pan
> <[email protected]> wrote:
>
> > Hi Fenghua,
> >
> > On Mon, 7 Feb 2022 15:02:48 -0800, Fenghua Yu <[email protected]>
> > wrote:
> >
> > > @@ -1047,8 +1040,6 @@ struct iommu_sva *intel_svm_bind(struct device
> > > *dev, struct mm_struct *mm, void }
> > >
> > > sva = intel_svm_bind_mm(iommu, dev, mm, flags);
> > > - if (IS_ERR_OR_NULL(sva))
> > > - intel_svm_free_pasid(mm);
> > If bind fails, the PASID has no IOMMU nor CPU context. It should be safe
> > to free here.
> >
> Actually, here we cannot tell if the bind is the first of the mm so I think
> this is fine.
>
> Reviewed-by: Jacob Pan <[email protected]>
Thank you very much for your review, Jacob!
-Fenghua
Hi, Jacob,
On Wed, Feb 09, 2022 at 07:16:14PM -0800, Jacob Pan wrote:
> Hi Fenghua,
>
> On Mon, 7 Feb 2022 15:02:48 -0800, Fenghua Yu <[email protected]> wrote:
>
> > @@ -1047,8 +1040,6 @@ struct iommu_sva *intel_svm_bind(struct device
> > *dev, struct mm_struct *mm, void }
> >
> > sva = intel_svm_bind_mm(iommu, dev, mm, flags);
> > - if (IS_ERR_OR_NULL(sva))
> > - intel_svm_free_pasid(mm);
> If bind fails, the PASID has no IOMMU nor CPU context. It should be safe to
> free here.
The PASID can not be freed even if bind fails. The PASID allocated earlier
(either in this thread or in another thread) might be populated to other
threads already and being used now.
Without freeing the PASID on bind failure, the worst case is the PASID might
not be used in the process (and will be freed on process exit anyway).
This all matches with the PASID life time described in the commit message.
Thanks.
-Fenghua
Hi, Tony,
On Thu, Feb 10, 2022 at 10:31:42AM -0800, Fenghua Yu wrote:
>
> On Thu, Feb 10, 2022 at 09:24:50AM -0800, Luck, Tony wrote:
> > On Thu, Feb 10, 2022 at 08:27:50AM -0800, Fenghua Yu wrote:
> > > Hi, Jacob,
> > >
> > > On Wed, Feb 09, 2022 at 07:16:14PM -0800, Jacob Pan wrote:
> > > > Hi Fenghua,
> > > >
> > > > On Mon, 7 Feb 2022 15:02:48 -0800, Fenghua Yu <[email protected]> wrote:
> > > >
> > > > > @@ -1047,8 +1040,6 @@ struct iommu_sva *intel_svm_bind(struct device
> > > > > *dev, struct mm_struct *mm, void }
> > > > >
> > > > > sva = intel_svm_bind_mm(iommu, dev, mm, flags);
> > > > > - if (IS_ERR_OR_NULL(sva))
> > > > > - intel_svm_free_pasid(mm);
> > > > If bind fails, the PASID has no IOMMU nor CPU context. It should be safe to
> > > > free here.
> > >
> > > The PASID can not be freed even if bind fails. The PASID allocated earlier
> > > (either in this thread or in another thread) might be populated to other
> > > threads already and being used now.
> > >
> > > Without freeing the PASID on bind failure, the worst case is the PASID might
> > > not be used in the process (and will be freed on process exit anyway).
> > >
> > > This all matches with the PASID life time described in the commit message.
> >
> > But what does this mean for the user that failed that intel_svm_bind_mm()?
> >
>
> That means the user may have a PASID but there is no PASID entry for the
> device. Then ENQCMD cannot be executed successfully.
>
> > Here's a scenario:
> >
> > Process sets up to use PASID capable device #1. Everything works,
> > so the process has mm->pasid, and the IOMMU has the tables to map
> > virtual addresses coming from device #1 using that PASID.
> >
> > Now the same process asks to start using PASID capable device #2,
> > but there is a failure at intel_svm_bind_mm().
> >
> > Fenghua is right that we shouldn't free the PASID. It is in use
> > by at least one thread of the process to access device #1.
> >
> > But what happens with device #2? Does the caller of intel_svm_bind()
> > do the right thing with the IS_ERR_OR_NULL return value to let the
> > user know that device #2 isn't accessible?
>
> A driver caller of intel_svm_bind() should handle this failure, i.e. fail
> the binding and let the user know the failure.
>
> Even if the driver doesn't do the right thing to handle the failure,
> intel_svm_bind() doesn't set up a PASID entry for device #2.
>
> One example is IDXD driver. User calls open()->IDXD driver idxd_cdev_open()
> ->intel_svm_bind()->intel_svm_bind_mm(). idxd_cdev_open() gets failed "sva"
> and passes the PTR_ERR(sva) to the user and the user cannot open the device.
> Due to the failure, no PASID entry is set up for the device.
>
> Even if the user ignores the open() failure and tries to run ENQCMD on
> device #2, a PASID table fault will be generated due to no PASID entry
> for the device and the ENQCMD execution will fail.
Plus, the above behavior of handling intel_svm_bind_mm() failure is expected
right behavior and the current series doesn't need to be changed.
Thanks.
-Fenghua
On 2/7/22 15:02, Fenghua Yu wrote:
...
> Get rid of the refcounting mechanisms and replace/rename the interfaces
> to reflect this new approach.
...
> .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 5 +--
> drivers/iommu/intel/iommu.c | 4 +-
> drivers/iommu/intel/svm.c | 9 -----
> drivers/iommu/ioasid.c | 39 ++-----------------
> drivers/iommu/iommu-sva-lib.c | 39 ++++++-------------
> drivers/iommu/iommu-sva-lib.h | 1 -
> include/linux/ioasid.h | 12 +-----
> include/linux/sched/mm.h | 16 ++++++++
> kernel/fork.c | 1 +
> 9 files changed, 38 insertions(+), 88 deletions(-)
Given the heavily non-x86 diffstat here, I was hoping to see some acks
from folks that this might affect, especially on the ARM side.
Is everyone OK with this?
On Mon, Feb 07 2022 at 15:02, Fenghua Yu wrote:
> PASIDs are process wide. It was attempted to use refcounted PASIDs to
> free them when the last thread drops the refcount. This turned out to
> be complex and error prone. Given the fact that the PASID space is 20
> bits, which allows up to 1M processes to have a PASID associated
> concurrently, PASID resource exhaustion is not a realistic concern.
>
> Therefore it was decided to simplify the approach and stick with lazy
> on demand PASID allocation, but drop the eager free approach and make
> a allocated PASID lifetime bound to the life time of the process.
>
> Get rid of the refcounting mechanisms and replace/rename the interfaces
> to reflect this new approach.
>
> Suggested-by: Dave Hansen <[email protected]>
> Signed-off-by: Fenghua Yu <[email protected]>
> Reviewed-by: Tony Luck <[email protected]>
Reviewed-by: Thomas Gleixner <[email protected]>
On Mon, Feb 07, 2022 at 03:02:48PM -0800, Fenghua Yu wrote:
> PASIDs are process wide. It was attempted to use refcounted PASIDs to
> free them when the last thread drops the refcount. This turned out to
> be complex and error prone. Given the fact that the PASID space is 20
> bits, which allows up to 1M processes to have a PASID associated
> concurrently, PASID resource exhaustion is not a realistic concern.
>
> Therefore it was decided to simplify the approach and stick with lazy
> on demand PASID allocation, but drop the eager free approach and make
> a allocated PASID lifetime bound to the life time of the process.
>
> Get rid of the refcounting mechanisms and replace/rename the interfaces
> to reflect this new approach.
>
> Suggested-by: Dave Hansen <[email protected]>
> Signed-off-by: Fenghua Yu <[email protected]>
> Reviewed-by: Tony Luck <[email protected]>
Acked-by: Joerg Roedel <[email protected]>
The following commit has been merged into the x86/pasid branch of tip:
Commit-ID: 701fac40384f07197b106136012804c3cae0b3de
Gitweb: https://git.kernel.org/tip/701fac40384f07197b106136012804c3cae0b3de
Author: Fenghua Yu <[email protected]>
AuthorDate: Mon, 07 Feb 2022 15:02:48 -08:00
Committer: Borislav Petkov <[email protected]>
CommitterDate: Tue, 15 Feb 2022 11:31:35 +01:00
iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit
PASIDs are process-wide. It was attempted to use refcounted PASIDs to
free them when the last thread drops the refcount. This turned out to
be complex and error prone. Given the fact that the PASID space is 20
bits, which allows up to 1M processes to have a PASID associated
concurrently, PASID resource exhaustion is not a realistic concern.
Therefore, it was decided to simplify the approach and stick with lazy
on demand PASID allocation, but drop the eager free approach and make an
allocated PASID's lifetime bound to the lifetime of the process.
Get rid of the refcounting mechanisms and replace/rename the interfaces
to reflect this new approach.
[ bp: Massage commit message. ]
Suggested-by: Dave Hansen <[email protected]>
Signed-off-by: Fenghua Yu <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Reviewed-by: Tony Luck <[email protected]>
Reviewed-by: Lu Baolu <[email protected]>
Reviewed-by: Jacob Pan <[email protected]>
Reviewed-by: Thomas Gleixner <[email protected]>
Acked-by: Joerg Roedel <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 5 +--
drivers/iommu/intel/iommu.c | 4 +-
drivers/iommu/intel/svm.c | 9 +----
drivers/iommu/ioasid.c | 39 +---------------
drivers/iommu/iommu-sva-lib.c | 39 ++++------------
drivers/iommu/iommu-sva-lib.h | 1 +-
include/linux/ioasid.h | 12 +-----
include/linux/sched/mm.h | 16 +++++++-
kernel/fork.c | 1 +-
9 files changed, 38 insertions(+), 88 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index a737ba5..22ddd05 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -340,14 +340,12 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm)
bond->smmu_mn = arm_smmu_mmu_notifier_get(smmu_domain, mm);
if (IS_ERR(bond->smmu_mn)) {
ret = PTR_ERR(bond->smmu_mn);
- goto err_free_pasid;
+ goto err_free_bond;
}
list_add(&bond->list, &master->bonds);
return &bond->sva;
-err_free_pasid:
- iommu_sva_free_pasid(mm);
err_free_bond:
kfree(bond);
return ERR_PTR(ret);
@@ -377,7 +375,6 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle)
if (refcount_dec_and_test(&bond->refs)) {
list_del(&bond->list);
arm_smmu_mmu_notifier_put(bond->smmu_mn);
- iommu_sva_free_pasid(bond->mm);
kfree(bond);
}
mutex_unlock(&sva_lock);
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 92fea3f..ef03b21 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4781,7 +4781,7 @@ attach_failed:
link_failed:
spin_unlock_irqrestore(&device_domain_lock, flags);
if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
- ioasid_put(domain->default_pasid);
+ ioasid_free(domain->default_pasid);
return ret;
}
@@ -4811,7 +4811,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain,
spin_unlock_irqrestore(&device_domain_lock, flags);
if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
- ioasid_put(domain->default_pasid);
+ ioasid_free(domain->default_pasid);
}
static int prepare_domain_attach_device(struct iommu_domain *domain,
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 5b5d69b..51ac209 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -514,11 +514,6 @@ static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm,
return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1);
}
-static void intel_svm_free_pasid(struct mm_struct *mm)
-{
- iommu_sva_free_pasid(mm);
-}
-
static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
struct device *dev,
struct mm_struct *mm,
@@ -662,8 +657,6 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
kfree(svm);
}
}
- /* Drop a PASID reference and free it if no reference. */
- intel_svm_free_pasid(mm);
}
out:
return ret;
@@ -1047,8 +1040,6 @@ struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void
}
sva = intel_svm_bind_mm(iommu, dev, mm, flags);
- if (IS_ERR_OR_NULL(sva))
- intel_svm_free_pasid(mm);
mutex_unlock(&pasid_mutex);
return sva;
diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 06fee74..a786c03 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -2,7 +2,7 @@
/*
* I/O Address Space ID allocator. There is one global IOASID space, split into
* subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and
- * free IOASIDs with ioasid_alloc and ioasid_put.
+ * free IOASIDs with ioasid_alloc() and ioasid_free().
*/
#include <linux/ioasid.h>
#include <linux/module.h>
@@ -15,7 +15,6 @@ struct ioasid_data {
struct ioasid_set *set;
void *private;
struct rcu_head rcu;
- refcount_t refs;
};
/*
@@ -315,7 +314,6 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
data->set = set;
data->private = private;
- refcount_set(&data->refs, 1);
/*
* Custom allocator needs allocator data to perform platform specific
@@ -348,35 +346,11 @@ exit_free:
EXPORT_SYMBOL_GPL(ioasid_alloc);
/**
- * ioasid_get - obtain a reference to the IOASID
- * @ioasid: the ID to get
- */
-void ioasid_get(ioasid_t ioasid)
-{
- struct ioasid_data *ioasid_data;
-
- spin_lock(&ioasid_allocator_lock);
- ioasid_data = xa_load(&active_allocator->xa, ioasid);
- if (ioasid_data)
- refcount_inc(&ioasid_data->refs);
- else
- WARN_ON(1);
- spin_unlock(&ioasid_allocator_lock);
-}
-EXPORT_SYMBOL_GPL(ioasid_get);
-
-/**
- * ioasid_put - Release a reference to an ioasid
+ * ioasid_free - Free an ioasid
* @ioasid: the ID to remove
- *
- * Put a reference to the IOASID, free it when the number of references drops to
- * zero.
- *
- * Return: %true if the IOASID was freed, %false otherwise.
*/
-bool ioasid_put(ioasid_t ioasid)
+void ioasid_free(ioasid_t ioasid)
{
- bool free = false;
struct ioasid_data *ioasid_data;
spin_lock(&ioasid_allocator_lock);
@@ -386,10 +360,6 @@ bool ioasid_put(ioasid_t ioasid)
goto exit_unlock;
}
- free = refcount_dec_and_test(&ioasid_data->refs);
- if (!free)
- goto exit_unlock;
-
active_allocator->ops->free(ioasid, active_allocator->ops->pdata);
/* Custom allocator needs additional steps to free the xa element */
if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) {
@@ -399,9 +369,8 @@ bool ioasid_put(ioasid_t ioasid)
exit_unlock:
spin_unlock(&ioasid_allocator_lock);
- return free;
}
-EXPORT_SYMBOL_GPL(ioasid_put);
+EXPORT_SYMBOL_GPL(ioasid_free);
/**
* ioasid_find - Find IOASID data
diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c
index bd41405..1065061 100644
--- a/drivers/iommu/iommu-sva-lib.c
+++ b/drivers/iommu/iommu-sva-lib.c
@@ -18,8 +18,7 @@ static DECLARE_IOASID_SET(iommu_sva_pasid);
*
* Try to allocate a PASID for this mm, or take a reference to the existing one
* provided it fits within the [@min, @max] range. On success the PASID is
- * available in mm->pasid, and must be released with iommu_sva_free_pasid().
- * @min must be greater than 0, because 0 indicates an unused mm->pasid.
+ * available in mm->pasid and will be available for the lifetime of the mm.
*
* Returns 0 on success and < 0 on error.
*/
@@ -33,38 +32,24 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max)
return -EINVAL;
mutex_lock(&iommu_sva_lock);
- if (mm->pasid) {
- if (mm->pasid >= min && mm->pasid <= max)
- ioasid_get(mm->pasid);
- else
+ /* Is a PASID already associated with this mm? */
+ if (pasid_valid(mm->pasid)) {
+ if (mm->pasid < min || mm->pasid >= max)
ret = -EOVERFLOW;
- } else {
- pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
- if (pasid == INVALID_IOASID)
- ret = -ENOMEM;
- else
- mm->pasid = pasid;
+ goto out;
}
+
+ pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
+ if (!pasid_valid(pasid))
+ ret = -ENOMEM;
+ else
+ mm_pasid_set(mm, pasid);
+out:
mutex_unlock(&iommu_sva_lock);
return ret;
}
EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid);
-/**
- * iommu_sva_free_pasid - Release the mm's PASID
- * @mm: the mm
- *
- * Drop one reference to a PASID allocated with iommu_sva_alloc_pasid()
- */
-void iommu_sva_free_pasid(struct mm_struct *mm)
-{
- mutex_lock(&iommu_sva_lock);
- if (ioasid_put(mm->pasid))
- mm->pasid = 0;
- mutex_unlock(&iommu_sva_lock);
-}
-EXPORT_SYMBOL_GPL(iommu_sva_free_pasid);
-
/* ioasid_find getter() requires a void * argument */
static bool __mmget_not_zero(void *mm)
{
diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva-lib.h
index 95dc3eb..8909ea1 100644
--- a/drivers/iommu/iommu-sva-lib.h
+++ b/drivers/iommu/iommu-sva-lib.h
@@ -9,7 +9,6 @@
#include <linux/mm_types.h>
int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max);
-void iommu_sva_free_pasid(struct mm_struct *mm);
struct mm_struct *iommu_sva_find(ioasid_t pasid);
/* I/O Page fault */
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index 2237f64..af1c9d6 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -34,8 +34,7 @@ struct ioasid_allocator_ops {
#if IS_ENABLED(CONFIG_IOASID)
ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
void *private);
-void ioasid_get(ioasid_t ioasid);
-bool ioasid_put(ioasid_t ioasid);
+void ioasid_free(ioasid_t ioasid);
void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
bool (*getter)(void *));
int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
@@ -53,14 +52,7 @@ static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
return INVALID_IOASID;
}
-static inline void ioasid_get(ioasid_t ioasid)
-{
-}
-
-static inline bool ioasid_put(ioasid_t ioasid)
-{
- return false;
-}
+static inline void ioasid_free(ioasid_t ioasid) { }
static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
bool (*getter)(void *))
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c74d1ed..a80356e 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -439,8 +439,24 @@ static inline void mm_pasid_init(struct mm_struct *mm)
{
mm->pasid = INVALID_IOASID;
}
+
+/* Associate a PASID with an mm_struct: */
+static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid)
+{
+ mm->pasid = pasid;
+}
+
+static inline void mm_pasid_drop(struct mm_struct *mm)
+{
+ if (pasid_valid(mm->pasid)) {
+ ioasid_free(mm->pasid);
+ mm->pasid = INVALID_IOASID;
+ }
+}
#else
static inline void mm_pasid_init(struct mm_struct *mm) {}
+static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) {}
+static inline void mm_pasid_drop(struct mm_struct *mm) {}
#endif
#endif /* _LINUX_SCHED_MM_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index deacd2c..c03c668 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1115,6 +1115,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ mm_pasid_drop(mm);
mmdrop(mm);
}
On 2022/4/11 下午10:10, Dave Hansen wrote:
> On 4/11/22 07:00, Zhangfei Gao wrote:
>> with this patchset, each time after sbin/nginx, ioasid is freed
>> immediately. lynx test will alloc the same ioasid=1.
> That doesn't seem right. Isn't 'sbin/nginx' still running when lynx
> runs? How can they get the same ioasid?
Yes, sbin/nginx is still running,
root 3228 0.0 0.0 31316 13476 ? Ssl 12:50 0:00
nginx: master process sbin/nginx
nobody 3230 0.0 0.0 32296 16456 ? Sl 12:50 0:00
nginx: worker process
Since ioasid is freed, so lynx can get the same ioasid.
>
> This sounds like a refcounting problem, like that the ioasid wasn't
> properly refcounted as nginx forked into the background.
Yes, in checking, thanks
and this patchset removed the old refcount.
>
>> To verify, hack comment mm_pasid_drop in __mmput will make the issue
>> disappear.
>>
>> log: after sbin/nginx.
>> [ 96.526730] Call trace:
>> [ 96.526732] dump_backtrace+0xe4/0xf0
>> [ 96.526741] show_stack+0x20/0x70
>> [ 96.526744] dump_stack_lvl+0x8c/0xb8
>> [ 96.526751] dump_stack+0x18/0x34
>> [ 96.526754] ioasid_free+0xdc/0xfc
>> [ 96.526757] mmput+0x138/0x160
>> [ 96.526760] do_exit+0x284/0x9d0
>> [ 96.526765] do_group_exit+0x3c/0xa8
>> [ 96.526767] __wake_up_parent+0x0/0x38
>> [ 96.526770] invoke_syscall+0x4c/0x110
>> [ 96.526775] el0_svc_common.constprop.0+0x68/0x128
>> [ 96.526778] do_el0_svc+0x2c/0x90
>> [ 96.526781] el0_svc+0x30/0x98
>> [ 96.526783] el0t_64_sync_handler+0xb0/0xb8
>> [ 96.526785] el0t_64_sync+0x18c/0x190
> Is there nothing before this call trace? Usually there will be at least
> some warning text.
I added dump_stack() in ioasid_free.
Thanks
On 2022/4/11 下午10:36, Dave Hansen wrote:
> On 4/11/22 07:20, [email protected] wrote:
>>> Is there nothing before this call trace? Usually there will be at least
>>> some warning text.
>> I added dump_stack() in ioasid_free.
> Hold on a sec, though...
>
> What's the *problem* here? Did something break or are you just saying
> that something looks weird to _you_?
After this, nginx is not working at all, and hardware reports error.
Suppose the the master use the ioasid for init, but got freed.
hardware reports:
[ 152.731869] hisi_sec2 0000:76:00.0: qm_acc_do_task_timeout [error
status=0x20] found
[ 152.739657] hisi_sec2 0000:76:00.0: qm_acc_wb_not_ready_timeout
[error status=0x40] found
[ 152.747877] hisi_sec2 0000:76:00.0: sec_fsm_hbeat_rint [error
status=0x20] found
[ 152.755340] hisi_sec2 0000:76:00.0: Controller resetting...
[ 152.762044] hisi_sec2 0000:76:00.0: QM mailbox operation timeout!
[ 152.768198] hisi_sec2 0000:76:00.0: Failed to dump sqc!
[ 152.773490] hisi_sec2 0000:76:00.0: Failed to drain out data for
stopping!
[ 152.781426] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
[ 152.787468] hisi_sec2 0000:76:00.0: Failed to dump sqc!
[ 152.792753] hisi_sec2 0000:76:00.0: Failed to drain out data for
stopping!
[ 152.800685] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
[ 152.806730] hisi_sec2 0000:76:00.0: Failed to dump sqc!
[ 152.812017] hisi_sec2 0000:76:00.0: Failed to drain out data for
stopping!
[ 152.819946] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
[ 152.825992] hisi_sec2 0000:76:00.0: Failed to dump sqc!
Thanks
>
> For instance, if we have:
>
> nginx: ioasid_alloc()==1
> fork(); // spawn the daemon
> exit();
> ioasid_free(1);
>
> and then a moment later:
>
> lynx: ioasid_alloc()==1
> fork();
> exit();
> ioasid_free(1);
>
> There's no problem. The original nginx process with ioasid==1 exited.
> The fact that *some* process called nginx is still running doesn't mean
> that it still has a reference to asid==1.
>
> Are you sure the nginx process that allocated the ASID is the same
> process you see in ps?
On 4/11/22 07:00, Zhangfei Gao wrote:
> with this patchset, each time after sbin/nginx, ioasid is freed
> immediately. lynx test will alloc the same ioasid=1.
That doesn't seem right. Isn't 'sbin/nginx' still running when lynx
runs? How can they get the same ioasid?
This sounds like a refcounting problem, like that the ioasid wasn't
properly refcounted as nginx forked into the background.
> To verify, hack comment mm_pasid_drop in __mmput will make the issue
> disappear.
>
> log: after sbin/nginx.
> [ 96.526730] Call trace:
> [ 96.526732] dump_backtrace+0xe4/0xf0
> [ 96.526741] show_stack+0x20/0x70
> [ 96.526744] dump_stack_lvl+0x8c/0xb8
> [ 96.526751] dump_stack+0x18/0x34
> [ 96.526754] ioasid_free+0xdc/0xfc
> [ 96.526757] mmput+0x138/0x160
> [ 96.526760] do_exit+0x284/0x9d0
> [ 96.526765] do_group_exit+0x3c/0xa8
> [ 96.526767] __wake_up_parent+0x0/0x38
> [ 96.526770] invoke_syscall+0x4c/0x110
> [ 96.526775] el0_svc_common.constprop.0+0x68/0x128
> [ 96.526778] do_el0_svc+0x2c/0x90
> [ 96.526781] el0_svc+0x30/0x98
> [ 96.526783] el0t_64_sync_handler+0xb0/0xb8
> [ 96.526785] el0t_64_sync+0x18c/0x190
Is there nothing before this call trace? Usually there will be at least
some warning text.
On 4/12/22 00:04, [email protected] wrote:
> master process quit, mmput -> mm_pasid_drop->ioasid_free
> But this ignore driver's iommu_sva_unbind_device function,
> iommu_sva_bind_device and iommu_sva_unbind_device are not pair, So
> driver does not know ioasid is freed.
>
> Any suggestion?
It sounds like you're saying that the device is still abound to the
PASID even though the mm has exited and freed the PASID. This is
essentially a use-after-free for the PASID. Right?
The right thing to do here is to have the PASID code hold a reference on
the mm. The mm "owns" the PASID for its entire lifetime and if anything
needs the PASID to live longer, its only recourse for doing that is via
an mmget(). I _think_ mmget() is the right thing as opposed to mmgrab()
because the PASID users actually need the page tables to be around.
This would still be nice to confirm with some traces of fork()/exit()
and the iommu_sva_{bind,unbind} and ioasid_{alloc,free} functions.
I wonder if the Intel and ARM IOMMU code differ in the way they keep
references to the mm, or if this affects Intel as well, but we just
haven't tested the code enough.
On 4/11/22 07:20, [email protected] wrote:
>> Is there nothing before this call trace? Usually there will be at least
>> some warning text.
> I added dump_stack() in ioasid_free.
Hold on a sec, though...
What's the *problem* here? Did something break or are you just saying
that something looks weird to _you_?
For instance, if we have:
nginx: ioasid_alloc()==1
fork(); // spawn the daemon
exit();
ioasid_free(1);
and then a moment later:
lynx: ioasid_alloc()==1
fork();
exit();
ioasid_free(1);
There's no problem. The original nginx process with ioasid==1 exited.
The fact that *some* process called nginx is still running doesn't mean
that it still has a reference to asid==1.
Are you sure the nginx process that allocated the ASID is the same
process you see in ps?
On 4/12/22 08:10, Jean-Philippe Brucker wrote:
>> I wonder if the Intel and ARM IOMMU code differ in the way they keep
>> references to the mm, or if this affects Intel as well, but we just
>> haven't tested the code enough.
> The Arm code was written expecting the PASID to be freed on unbind(), not
> mm exit. I missed the change of behavior, sorry (I thought your plan was
> to extend PASID lifetime, not shorten it?) but as is it seems very broken.
> For example in the iommu_sva_unbind_device(), we have
> arm_smmu_mmu_notifier_put() clearing the PASID table entry for
> "mm->pasid", which is going to end badly if the PASID has been cleared or
> reallocated. We can't clear the PASID entry in mm exit because at that
> point the device may still be issuing DMA for that PASID and we need to
> quiesce the entry rather than deactivate it.
I think we ended up flipping some of this around on the Intel side.
Instead of having to quiesce the device on mm exit, we don't let the mm
exit until the device is done.
When you program the pasid into the device, it's a lot like when you
create a thread. We bump the reference count on the mm when we program
the page table pointer into a CPU. We drop the thread's reference to
the mm when the thread exits and will no longer be using the page tables.
Same thing with pasids. We bump the refcount on the mm when the pasid
is programmed into the device. Once the device is done with the mm, we
drop the mm.
Basically, instead of recounting the pasid itself, we just refcount the mm.
On 4/11/22 07:44, [email protected] wrote:
> On 2022/4/11 下午10:36, Dave Hansen wrote:
>> On 4/11/22 07:20, [email protected] wrote:
>>>> Is there nothing before this call trace? Usually there will be at least
>>>> some warning text.
>>> I added dump_stack() in ioasid_free.
>> Hold on a sec, though...
>>
>> What's the *problem* here? Did something break or are you just saying
>> that something looks weird to _you_?
>
> After this, nginx is not working at all, and hardware reports error.
> Suppose the the master use the ioasid for init, but got freed.
>
> hardware reports:
> [ 152.731869] hisi_sec2 0000:76:00.0: qm_acc_do_task_timeout [error status=0x20] found
> [ 152.739657] hisi_sec2 0000:76:00.0: qm_acc_wb_not_ready_timeout [error status=0x40] found
> [ 152.747877] hisi_sec2 0000:76:00.0: sec_fsm_hbeat_rint [error status=0x20] found
> [ 152.755340] hisi_sec2 0000:76:00.0: Controller resetting...
> [ 152.762044] hisi_sec2 0000:76:00.0: QM mailbox operation timeout!
> [ 152.768198] hisi_sec2 0000:76:00.0: Failed to dump sqc!
> [ 152.773490] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
> [ 152.781426] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
> [ 152.787468] hisi_sec2 0000:76:00.0: Failed to dump sqc!
> [ 152.792753] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
> [ 152.800685] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
> [ 152.806730] hisi_sec2 0000:76:00.0: Failed to dump sqc!
> [ 152.812017] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
> [ 152.819946] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
> [ 152.825992] hisi_sec2 0000:76:00.0: Failed to dump sqc!
That would have been awfully handy information to have in an initial bug report. :)
Is there a chance you could dump out that ioasid alloc *and* free information in ioasid_alloc/free()? This could be some kind of problem with the allocator, or with copying the ioasid at fork.
On 4/12/22 06:41, Fenghua Yu wrote:
>> master process quit, mmput -> mm_pasid_drop->ioasid_free
>> But this ignore driver's iommu_sva_unbind_device function,
>> iommu_sva_bind_device and iommu_sva_unbind_device are not pair, So driver
>> does not know ioasid is freed.
>>
>> Any suggestion?
> ioasid is per process or per mm. A daemon process shouldn't share the same
> ioasid with any other process with even its parent process. Its parent gets
> an ioasid and frees it on exit. The ioasid is gone and shouldn't be used
> by its child process.
>
> Each daemon process should call driver -> iommu_sva_bind_device -> ioasid_alloc
> to get its own ioasid/PASID. On daemon quit, the ioasid is freed.
>
> That means nqnix needs to be changed.
Fenghua, please step back for a second and look at what you are saying.
Your patch caused userspace to break. Now, you're telling someone that
they need to go change that userspace to work around something that your
patch. How, exactly, are you suggesting that nginx could change to fix
this? What, specifically, was it doing with *fork()* that was wrong?
It sounds to me like you're saying that it's OK to break userspace.
On 2022/4/11 下午10:52, Dave Hansen wrote:
> On 4/11/22 07:44, [email protected] wrote:
>> On 2022/4/11 下午10:36, Dave Hansen wrote:
>>> On 4/11/22 07:20, [email protected] wrote:
>>>>> Is there nothing before this call trace? Usually there will be at least
>>>>> some warning text.
>>>> I added dump_stack() in ioasid_free.
>>> Hold on a sec, though...
>>>
>>> What's the *problem* here? Did something break or are you just saying
>>> that something looks weird to _you_?
>> After this, nginx is not working at all, and hardware reports error.
>> Suppose the the master use the ioasid for init, but got freed.
>>
>> hardware reports:
>> [ 152.731869] hisi_sec2 0000:76:00.0: qm_acc_do_task_timeout [error status=0x20] found
>> [ 152.739657] hisi_sec2 0000:76:00.0: qm_acc_wb_not_ready_timeout [error status=0x40] found
>> [ 152.747877] hisi_sec2 0000:76:00.0: sec_fsm_hbeat_rint [error status=0x20] found
>> [ 152.755340] hisi_sec2 0000:76:00.0: Controller resetting...
>> [ 152.762044] hisi_sec2 0000:76:00.0: QM mailbox operation timeout!
>> [ 152.768198] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>> [ 152.773490] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
>> [ 152.781426] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
>> [ 152.787468] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>> [ 152.792753] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
>> [ 152.800685] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
>> [ 152.806730] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>> [ 152.812017] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
>> [ 152.819946] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
>> [ 152.825992] hisi_sec2 0000:76:00.0: Failed to dump sqc!
> That would have been awfully handy information to have in an initial bug report. :)
> Is there a chance you could dump out that ioasid alloc *and* free information in ioasid_alloc/free()? This could be some kind of problem with the allocator, or with copying the ioasid at fork.
The issue is nginx master process init resource, start daemon process,
then master process quit and free ioasid.
The daemon nginx process is not the original master process.
master process: init resource
driver -> iommu_sva_bind_device -> ioasid_alloc
nginx : ngx_daemon
fork daemon, without add mm's refcount.
src/os/unix/ngx_daemon.c
ngx_daemon(ngx_log_t *log)
{
int fd;
switch (fork()) {
case -1:
ngx_log_error(NGX_LOG_EMERG, log, ngx_errno, "fork() failed");
return NGX_ERROR;
case 0: // here master process is quit directly and will be
released.
break;
default:
exit(0);
}
// here daemon process take control.
ngx_parent = ngx_pid;
ngx_pid = ngx_getpid();
fork.c
copy_mm
if (clone_flags & CLONE_VM) {
mmget(oldmm);
mm = oldmm;
} else {
mm = dup_mm(tsk, current->mm); // here daemon
process handling without mmget.
master process quit, mmput -> mm_pasid_drop->ioasid_free
But this ignore driver's iommu_sva_unbind_device function,
iommu_sva_bind_device and iommu_sva_unbind_device are not pair, So
driver does not know ioasid is freed.
Any suggestion?
Or can we still use the original ioasid refcount mechanism?
Thanks
On 2022/4/11 下午10:52, Dave Hansen wrote:
> On 4/11/22 07:44, [email protected] wrote:
>> On 2022/4/11 下午10:36, Dave Hansen wrote:
>>> On 4/11/22 07:20, [email protected] wrote:
>>>>> Is there nothing before this call trace? Usually there will be at least
>>>>> some warning text.
>>>> I added dump_stack() in ioasid_free.
>>> Hold on a sec, though...
>>>
>>> What's the *problem* here? Did something break or are you just saying
>>> that something looks weird to _you_?
>> After this, nginx is not working at all, and hardware reports error.
>> Suppose the the master use the ioasid for init, but got freed.
>>
>> hardware reports:
>> [ 152.731869] hisi_sec2 0000:76:00.0: qm_acc_do_task_timeout [error status=0x20] found
>> [ 152.739657] hisi_sec2 0000:76:00.0: qm_acc_wb_not_ready_timeout [error status=0x40] found
>> [ 152.747877] hisi_sec2 0000:76:00.0: sec_fsm_hbeat_rint [error status=0x20] found
>> [ 152.755340] hisi_sec2 0000:76:00.0: Controller resetting...
>> [ 152.762044] hisi_sec2 0000:76:00.0: QM mailbox operation timeout!
>> [ 152.768198] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>> [ 152.773490] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
>> [ 152.781426] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
>> [ 152.787468] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>> [ 152.792753] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
>> [ 152.800685] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
>> [ 152.806730] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>> [ 152.812017] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
>> [ 152.819946] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
>> [ 152.825992] hisi_sec2 0000:76:00.0: Failed to dump sqc!
> That would have been awfully handy information to have in an initial bug report. :)
> Is there a chance you could dump out that ioasid alloc *and* free information in ioasid_alloc/free()? This could be some kind of problem with the allocator, or with copying the ioasid at fork.
Sure, will add some trace tomorrow.
Thanks Dave
Hi,
On Tue, Apr 12, 2022 at 07:36:21AM -0700, Dave Hansen wrote:
> On 4/12/22 00:04, [email protected] wrote:
> > master process quit, mmput -> mm_pasid_drop->ioasid_free
> > But this ignore driver's iommu_sva_unbind_device function,
> > iommu_sva_bind_device and iommu_sva_unbind_device are not pair, So
> > driver does not know ioasid is freed.
> >
> > Any suggestion?
>
> It sounds like you're saying that the device is still abound to the
> PASID even though the mm has exited and freed the PASID. This is
> essentially a use-after-free for the PASID. Right?
>
> The right thing to do here is to have the PASID code hold a reference on
> the mm. The mm "owns" the PASID for its entire lifetime and if anything
> needs the PASID to live longer, its only recourse for doing that is via
> an mmget(). I _think_ mmget() is the right thing as opposed to mmgrab()
> because the PASID users actually need the page tables to be around.
>
> This would still be nice to confirm with some traces of fork()/exit()
> and the iommu_sva_{bind,unbind} and ioasid_{alloc,free} functions.
>
> I wonder if the Intel and ARM IOMMU code differ in the way they keep
> references to the mm, or if this affects Intel as well, but we just
> haven't tested the code enough.
The Arm code was written expecting the PASID to be freed on unbind(), not
mm exit. I missed the change of behavior, sorry (I thought your plan was
to extend PASID lifetime, not shorten it?) but as is it seems very broken.
For example in the iommu_sva_unbind_device(), we have
arm_smmu_mmu_notifier_put() clearing the PASID table entry for
"mm->pasid", which is going to end badly if the PASID has been cleared or
reallocated. We can't clear the PASID entry in mm exit because at that
point the device may still be issuing DMA for that PASID and we need to
quiesce the entry rather than deactivate it. We can only deactivate it
once the device driver has properly stopped the device, at which point it
can call unbind(). There may be other issues but I can't check it
thoroughly until next week.
Thanks,
Jean
Hi, Fenghua
On 2022/4/12 下午9:41, Fenghua Yu wrote:
> Hi, Zhangfei,
>
> On Tue, Apr 12, 2022 at 03:04:09PM +0800, [email protected] wrote:
>>
>> On 2022/4/11 下午10:52, Dave Hansen wrote:
>>> On 4/11/22 07:44, [email protected] wrote:
>>>> On 2022/4/11 下午10:36, Dave Hansen wrote:
>>>>> On 4/11/22 07:20, [email protected] wrote:
>>>>>>> Is there nothing before this call trace? Usually there will be at least
>>>>>>> some warning text.
>>>>>> I added dump_stack() in ioasid_free.
>>>>> Hold on a sec, though...
>>>>>
>>>>> What's the *problem* here? Did something break or are you just saying
>>>>> that something looks weird to _you_?
>>>> After this, nginx is not working at all, and hardware reports error.
>>>> Suppose the the master use the ioasid for init, but got freed.
>>>>
>>>> hardware reports:
>>>> [ 152.731869] hisi_sec2 0000:76:00.0: qm_acc_do_task_timeout [error status=0x20] found
>>>> [ 152.739657] hisi_sec2 0000:76:00.0: qm_acc_wb_not_ready_timeout [error status=0x40] found
>>>> [ 152.747877] hisi_sec2 0000:76:00.0: sec_fsm_hbeat_rint [error status=0x20] found
>>>> [ 152.755340] hisi_sec2 0000:76:00.0: Controller resetting...
>>>> [ 152.762044] hisi_sec2 0000:76:00.0: QM mailbox operation timeout!
>>>> [ 152.768198] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>>>> [ 152.773490] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
>>>> [ 152.781426] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
>>>> [ 152.787468] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>>>> [ 152.792753] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
>>>> [ 152.800685] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
>>>> [ 152.806730] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>>>> [ 152.812017] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
>>>> [ 152.819946] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
>>>> [ 152.825992] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>>> That would have been awfully handy information to have in an initial bug report. :)
>>> Is there a chance you could dump out that ioasid alloc *and* free information in ioasid_alloc/free()? This could be some kind of problem with the allocator, or with copying the ioasid at fork.
>> The issue is nginx master process init resource, start daemon process, then
>> master process quit and free ioasid.
>> The daemon nginx process is not the original master process.
>>
>> master process: init resource
>> driver -> iommu_sva_bind_device -> ioasid_alloc
> Which code in the master process/daemon calls driver->iommu_sva_unbind_device?
Our calling sequence is nginx -> openssl -> openssl engine -> kernel driver
The calling entrence should be ngx_ssl_init : OPENSSL_config(NULL);
nginx:
src/event/ngx_event_openssl.c
ngx_ssl_init
if (OPENSSL_init_ssl(OPENSSL_INIT_LOAD_CONFIG, NULL) == 0)
I add some print.
/usr/local/nginx$ sudo sbin/nginx
ngx_ssl_init pid=2361
bind_fn
ngx_openssl_create_conf pid=2361
hisi sec init Kunpeng920!
ngx_ssl_create pid=2361
ngx_ssl_certificates pid=2361
ngx_ssl_certificate pid=2361
uadk_e_wd_digest_init
hisi sec init Kunpeng920!
ngx_ssl_ciphers pid=2361
ngx_daemon pid=2361 fork daemon
master pid=2361 will exit // here master
process is exit
fork return 0 pid=2364 // here
daemon process started
ngx_daemon fork ngx_pid=2364, ngx_parent=2361
$ ps -aux | grep nginx
root 2364 0.0 0.0 31324 15380 ? Ssl 15:21 0:00
nginx: master process sbin/nginx
nobody 2366 0.0 0.0 32304 16448 ? Sl 15:21 0:00
nginx: worker process
linaro 2371 0.0 0.0 7696 2048 pts/0 S+ 15:22 0:00 grep
--color=auto nginx
nginx
src/os/unix/ngx_daemon.c
ngx_daemon(ngx_log_t *log)
{
int fd;
switch (fork()) {
case -1:
ngx_log_error(NGX_LOG_EMERG, log, ngx_errno, "fork() failed");
return NGX_ERROR;
case 0:
// here fork daemon process
break;
default:
// master process directly exit, and release mm as well as ioasid
exit(0);
}
// only daemon process
ngx_parent = ngx_pid;
ngx_pid = ngx_getpid();
>
>> nginx : ngx_daemon
>> fork daemon, without add mm's refcount.
>>
>> src/os/unix/ngx_daemon.c
>> ngx_daemon(ngx_log_t *log)
>> {
>> int fd;
>>
>> switch (fork()) {
>> case -1:
>> ngx_log_error(NGX_LOG_EMERG, log, ngx_errno, "fork() failed");
>> return NGX_ERROR;
>>
>> case 0: // here master process is quit directly and will be
>> released.
>> break;
>>
>> default:
>> exit(0);
>> }
>> // here daemon process take control.
>> ngx_parent = ngx_pid;
>> ngx_pid = ngx_getpid();
>>
>>
>> fork.c
>> copy_mm
>> if (clone_flags & CLONE_VM) {
>> mmget(oldmm);
>> mm = oldmm;
>> } else {
>> mm = dup_mm(tsk, current->mm); // here daemon process
>> handling without mmget.
>>
>> master process quit, mmput -> mm_pasid_drop->ioasid_free
>> But this ignore driver's iommu_sva_unbind_device function,
>> iommu_sva_bind_device and iommu_sva_unbind_device are not pair, So driver
>> does not know ioasid is freed.
>>
>> Any suggestion?
> ioasid is per process or per mm. A daemon process shouldn't share the same
> ioasid with any other process with even its parent process. Its parent gets
> an ioasid and frees it on exit. The ioasid is gone and shouldn't be used
> by its child process.
>
> Each daemon process should call driver -> iommu_sva_bind_device -> ioasid_alloc
> to get its own ioasid/PASID. On daemon quit, the ioasid is freed.
>
> That means nqnix needs to be changed.
Agree with Dave, I think user space should not be broken.
Thanks
Hi, Zhangfei,
On Tue, Apr 12, 2022 at 03:04:09PM +0800, [email protected] wrote:
>
>
> On 2022/4/11 下午10:52, Dave Hansen wrote:
> > On 4/11/22 07:44, [email protected] wrote:
> > > On 2022/4/11 下午10:36, Dave Hansen wrote:
> > > > On 4/11/22 07:20, [email protected] wrote:
> > > > > > Is there nothing before this call trace? Usually there will be at least
> > > > > > some warning text.
> > > > > I added dump_stack() in ioasid_free.
> > > > Hold on a sec, though...
> > > >
> > > > What's the *problem* here? Did something break or are you just saying
> > > > that something looks weird to _you_?
> > > After this, nginx is not working at all, and hardware reports error.
> > > Suppose the the master use the ioasid for init, but got freed.
> > >
> > > hardware reports:
> > > [ 152.731869] hisi_sec2 0000:76:00.0: qm_acc_do_task_timeout [error status=0x20] found
> > > [ 152.739657] hisi_sec2 0000:76:00.0: qm_acc_wb_not_ready_timeout [error status=0x40] found
> > > [ 152.747877] hisi_sec2 0000:76:00.0: sec_fsm_hbeat_rint [error status=0x20] found
> > > [ 152.755340] hisi_sec2 0000:76:00.0: Controller resetting...
> > > [ 152.762044] hisi_sec2 0000:76:00.0: QM mailbox operation timeout!
> > > [ 152.768198] hisi_sec2 0000:76:00.0: Failed to dump sqc!
> > > [ 152.773490] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
> > > [ 152.781426] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
> > > [ 152.787468] hisi_sec2 0000:76:00.0: Failed to dump sqc!
> > > [ 152.792753] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
> > > [ 152.800685] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
> > > [ 152.806730] hisi_sec2 0000:76:00.0: Failed to dump sqc!
> > > [ 152.812017] hisi_sec2 0000:76:00.0: Failed to drain out data for stopping!
> > > [ 152.819946] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
> > > [ 152.825992] hisi_sec2 0000:76:00.0: Failed to dump sqc!
> > That would have been awfully handy information to have in an initial bug report. :)
> > Is there a chance you could dump out that ioasid alloc *and* free information in ioasid_alloc/free()? This could be some kind of problem with the allocator, or with copying the ioasid at fork.
> The issue is nginx master process init resource, start daemon process, then
> master process quit and free ioasid.
> The daemon nginx process is not the original master process.
>
> master process: init resource
> driver -> iommu_sva_bind_device -> ioasid_alloc
Which code in the master process/daemon calls driver->iommu_sva_unbind_device?
>
> nginx : ngx_daemon
> fork daemon, without add mm's refcount.
>
> src/os/unix/ngx_daemon.c
> ngx_daemon(ngx_log_t *log)
> {
> int fd;
>
> switch (fork()) {
> case -1:
> ngx_log_error(NGX_LOG_EMERG, log, ngx_errno, "fork() failed");
> return NGX_ERROR;
>
> case 0: // here master process is quit directly and will be
> released.
> break;
>
> default:
> exit(0);
> }
> // here daemon process take control.
> ngx_parent = ngx_pid;
> ngx_pid = ngx_getpid();
>
>
> fork.c
> copy_mm
> if (clone_flags & CLONE_VM) {
> mmget(oldmm);
> mm = oldmm;
> } else {
> mm = dup_mm(tsk, current->mm); // here daemon process
> handling without mmget.
>
> master process quit, mmput -> mm_pasid_drop->ioasid_free
> But this ignore driver's iommu_sva_unbind_device function,
> iommu_sva_bind_device and iommu_sva_unbind_device are not pair, So driver
> does not know ioasid is freed.
>
> Any suggestion?
ioasid is per process or per mm. A daemon process shouldn't share the same
ioasid with any other process with even its parent process. Its parent gets
an ioasid and frees it on exit. The ioasid is gone and shouldn't be used
by its child process.
Each daemon process should call driver -> iommu_sva_bind_device -> ioasid_alloc
to get its own ioasid/PASID. On daemon quit, the ioasid is freed.
That means nqnix needs to be changed.
> Or can we still use the original ioasid refcount mechanism?
>
Thanks.
-Fenghua
Hi Dave,
On 2022/4/12 23:35, Dave Hansen wrote:
> On 4/12/22 08:10, Jean-Philippe Brucker wrote:
>>> I wonder if the Intel and ARM IOMMU code differ in the way they keep
>>> references to the mm, or if this affects Intel as well, but we just
>>> haven't tested the code enough.
>> The Arm code was written expecting the PASID to be freed on unbind(), not
>> mm exit. I missed the change of behavior, sorry (I thought your plan was
>> to extend PASID lifetime, not shorten it?) but as is it seems very broken.
>> For example in the iommu_sva_unbind_device(), we have
>> arm_smmu_mmu_notifier_put() clearing the PASID table entry for
>> "mm->pasid", which is going to end badly if the PASID has been cleared or
>> reallocated. We can't clear the PASID entry in mm exit because at that
>> point the device may still be issuing DMA for that PASID and we need to
>> quiesce the entry rather than deactivate it.
>
> I think we ended up flipping some of this around on the Intel side.
> Instead of having to quiesce the device on mm exit, we don't let the mm
> exit until the device is done.
The Intel IOMMU code doesn't quiesce the device on mm exit. It only
tears down the PASID entry so that the subsequent device accesses to mm
is dropped silently.
Just like ARM, Intel IOMMU code also expects that PASID should be freed
and reused after device is done (i.e. after iommu_sva_unbind_device())
so that the PASID could be drained in both hardware and software before
reusing it for other purpose.
>
> When you program the pasid into the device, it's a lot like when you
> create a thread. We bump the reference count on the mm when we program
> the page table pointer into a CPU. We drop the thread's reference to
> the mm when the thread exits and will no longer be using the page tables.
>
> Same thing with pasids. We bump the refcount on the mm when the pasid
> is programmed into the device. Once the device is done with the mm, we
> drop the mm.
>
> Basically, instead of recounting the pasid itself, we just refcount the mm.
Above makes sense to me. It guarantees that the mm->pasid could only be
freed and reused after the device is done.
Best regards,
baolu
On 2022/4/12 下午11:35, [email protected] wrote:
> Hi, Fenghua
>
> On 2022/4/12 下午9:41, Fenghua Yu wrote:
>> Hi, Zhangfei,
>>
>> On Tue, Apr 12, 2022 at 03:04:09PM +0800, [email protected]
>> wrote:
>>>
>>> On 2022/4/11 下午10:52, Dave Hansen wrote:
>>>> On 4/11/22 07:44, [email protected] wrote:
>>>>> On 2022/4/11 下午10:36, Dave Hansen wrote:
>>>>>> On 4/11/22 07:20, [email protected] wrote:
>>>>>>>> Is there nothing before this call trace? Usually there will be
>>>>>>>> at least
>>>>>>>> some warning text.
>>>>>>> I added dump_stack() in ioasid_free.
>>>>>> Hold on a sec, though...
>>>>>>
>>>>>> What's the *problem* here? Did something break or are you just
>>>>>> saying
>>>>>> that something looks weird to _you_?
>>>>> After this, nginx is not working at all, and hardware reports error.
>>>>> Suppose the the master use the ioasid for init, but got freed.
>>>>>
>>>>> hardware reports:
>>>>> [ 152.731869] hisi_sec2 0000:76:00.0: qm_acc_do_task_timeout
>>>>> [error status=0x20] found
>>>>> [ 152.739657] hisi_sec2 0000:76:00.0: qm_acc_wb_not_ready_timeout
>>>>> [error status=0x40] found
>>>>> [ 152.747877] hisi_sec2 0000:76:00.0: sec_fsm_hbeat_rint [error
>>>>> status=0x20] found
>>>>> [ 152.755340] hisi_sec2 0000:76:00.0: Controller resetting...
>>>>> [ 152.762044] hisi_sec2 0000:76:00.0: QM mailbox operation timeout!
>>>>> [ 152.768198] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>>>>> [ 152.773490] hisi_sec2 0000:76:00.0: Failed to drain out data
>>>>> for stopping!
>>>>> [ 152.781426] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
>>>>> [ 152.787468] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>>>>> [ 152.792753] hisi_sec2 0000:76:00.0: Failed to drain out data
>>>>> for stopping!
>>>>> [ 152.800685] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
>>>>> [ 152.806730] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>>>>> [ 152.812017] hisi_sec2 0000:76:00.0: Failed to drain out data
>>>>> for stopping!
>>>>> [ 152.819946] hisi_sec2 0000:76:00.0: QM mailbox is busy to start!
>>>>> [ 152.825992] hisi_sec2 0000:76:00.0: Failed to dump sqc!
>>>> That would have been awfully handy information to have in an
>>>> initial bug report. :)
>>>> Is there a chance you could dump out that ioasid alloc *and* free
>>>> information in ioasid_alloc/free()? This could be some kind of
>>>> problem with the allocator, or with copying the ioasid at fork.
>>> The issue is nginx master process init resource, start daemon
>>> process, then
>>> master process quit and free ioasid.
>>> The daemon nginx process is not the original master process.
>>>
>>> master process: init resource
>>> driver -> iommu_sva_bind_device -> ioasid_alloc
>> Which code in the master process/daemon calls
>> driver->iommu_sva_unbind_device?
> Our calling sequence is nginx -> openssl -> openssl engine -> kernel
> driver
> The calling entrence should be ngx_ssl_init : OPENSSL_config(NULL);
>
> nginx:
> src/event/ngx_event_openssl.c
> ngx_ssl_init
> if (OPENSSL_init_ssl(OPENSSL_INIT_LOAD_CONFIG, NULL) == 0)
>
> I add some print.
>
> /usr/local/nginx$ sudo sbin/nginx
> ngx_ssl_init pid=2361
> bind_fn
> ngx_openssl_create_conf pid=2361
> hisi sec init Kunpeng920!
> ngx_ssl_create pid=2361
> ngx_ssl_certificates pid=2361
> ngx_ssl_certificate pid=2361
> uadk_e_wd_digest_init
> hisi sec init Kunpeng920!
> ngx_ssl_ciphers pid=2361
> ngx_daemon pid=2361 fork daemon
> master pid=2361 will exit // here
> master process is exit
> fork return 0 pid=2364 // here
> daemon process started
> ngx_daemon fork ngx_pid=2364, ngx_parent=2361
> $ ps -aux | grep nginx
> root 2364 0.0 0.0 31324 15380 ? Ssl 15:21 0:00
> nginx: master process sbin/nginx
> nobody 2366 0.0 0.0 32304 16448 ? Sl 15:21 0:00
> nginx: worker process
> linaro 2371 0.0 0.0 7696 2048 pts/0 S+ 15:22 0:00
> grep --color=auto nginx
>
> nginx
> src/os/unix/ngx_daemon.c
> ngx_daemon(ngx_log_t *log)
> {
> int fd;
>
> switch (fork()) {
> case -1:
> ngx_log_error(NGX_LOG_EMERG, log, ngx_errno, "fork() failed");
> return NGX_ERROR;
>
> case 0:
> // here fork daemon process
> break;
>
> default:
> // master process directly exit, and release mm as well as ioasid
> exit(0);
> }
>
> // only daemon process
> ngx_parent = ngx_pid;
> ngx_pid = ngx_getpid();
>>> Any suggestion?
>> ioasid is per process or per mm. A daemon process shouldn't share the
>> same
>> ioasid with any other process with even its parent process. Its
>> parent gets
>> an ioasid and frees it on exit. The ioasid is gone and shouldn't be used
>> by its child process.
>>
>> Each daemon process should call driver -> iommu_sva_bind_device ->
>> ioasid_alloc
>> to get its own ioasid/PASID. On daemon quit, the ioasid is freed.
>>
>> That means nqnix needs to be changed.
>
> Agree with Dave, I think user space should not be broken.
>
> Thanks
Any plan about this regression?
Currently I need this patch to workaround the issue.
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 22ddd05bbdcd..2d74ac53d11c 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -4,6 +4,7 @@
*/
#include <linux/mm.h>
+#include <linux/sched/mm.h>
#include <linux/mmu_context.h>
#include <linux/mmu_notifier.h>
#include <linux/slab.h>
@@ -363,6 +364,7 @@ arm_smmu_sva_bind(struct device *dev, struct
mm_struct *mm, void *drvdata)
mutex_lock(&sva_lock);
handle = __arm_smmu_sva_bind(dev, mm);
+ mmget(mm);
mutex_unlock(&sva_lock);
return handle;
}
@@ -377,6 +379,7 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle)
arm_smmu_mmu_notifier_put(bond->smmu_mn);
kfree(bond);
}
+ mmput(bond->mm);
mutex_unlock(&sva_lock);
}
Thanks
Hi, Dave,
On Tue, Apr 12, 2022 at 07:39:10AM -0700, Dave Hansen wrote:
> On 4/12/22 06:41, Fenghua Yu wrote:
> >> master process quit, mmput ->? mm_pasid_drop->ioasid_free
> >> But this ignore driver's iommu_sva_unbind_device function,
> >> iommu_sva_bind_device and iommu_sva_unbind_device are not pair,? So driver
> >> does not know ioasid is freed.
> >>
> >> Any suggestion?
> > ioasid is per process or per mm. A daemon process shouldn't share the same
> > ioasid with any other process with even its parent process. Its parent gets
> > an ioasid and frees it on exit. The ioasid is gone and shouldn't be used
> > by its child process.
> >
> > Each daemon process should call driver -> iommu_sva_bind_device -> ioasid_alloc
> > to get its own ioasid/PASID. On daemon quit, the ioasid is freed.
> >
> > That means nqnix needs to be changed.
>
> Fenghua, please step back for a second and look at what you are saying.
> Your patch caused userspace to break. Now, you're telling someone that
> they need to go change that userspace to work around something that your
> patch. How, exactly, are you suggesting that nginx could change to fix
> this? What, specifically, was it doing with *fork()* that was wrong?
>
> It sounds to me like you're saying that it's OK to break userspace.
You are right. The patch should not break userspace. I follow your
suggestion to fix the issue by mmget() in binding and mmput() in unbinding.
The RFC patch was sent out in another thread. Please review it.
Thank you very much for your advice.
-Fenghua
On Thu, Apr 14, 2022 at 06:08:09PM +0800, [email protected] wrote:
>
> On 2022/4/12 下午11:35, [email protected] wrote:
> > Hi, Fenghua
> >
> > On 2022/4/12 下午9:41, Fenghua Yu wrote:
> > > Hi, Zhangfei,
> > >
> > > On Tue, Apr 12, 2022 at 03:04:09PM +0800, [email protected]
> > > wrote:
> > > >
> > > > On 2022/4/11 下午10:52, Dave Hansen wrote:
> > > > > On 4/11/22 07:44, [email protected] wrote:
> > > > > > On 2022/4/11 下午10:36, Dave Hansen wrote:
> > > > > > > On 4/11/22 07:20, [email protected] wrote:
> > Agree with Dave, I think user space should not be broken.
> >
> > Thanks
>
> Any plan about this regression?
> Currently I need this patch to workaround the issue.
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> index 22ddd05bbdcd..2d74ac53d11c 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> @@ -4,6 +4,7 @@
> */
>
> #include <linux/mm.h>
> +#include <linux/sched/mm.h>
> #include <linux/mmu_context.h>
> #include <linux/mmu_notifier.h>
> #include <linux/slab.h>
> @@ -363,6 +364,7 @@ arm_smmu_sva_bind(struct device *dev, struct mm_struct
> *mm, void *drvdata)
>
> mutex_lock(&sva_lock);
> handle = __arm_smmu_sva_bind(dev, mm);
> + mmget(mm);
> mutex_unlock(&sva_lock);
> return handle;
> }
> @@ -377,6 +379,7 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle)
> arm_smmu_mmu_notifier_put(bond->smmu_mn);
> kfree(bond);
> }
> + mmput(bond->mm);
> mutex_unlock(&sva_lock);
> }
Could you please review and/or test this patch? It's supposed to fix
the PASID issue on both ARM and X86.
From a6444e1e5bd8076f5e5c5e950d3192de327f0c9c Mon Sep 17 00:00:00 2001
From: Fenghua Yu <[email protected]>
Date: Fri, 15 Apr 2022 00:51:33 -0700
Subject: [RFC PATCH] iommu/sva: Fix PASID use-after-free issue
A PASID might be still used even though it is freed on mm exit.
process A:
sva_bind();
ioasid_alloc() = N; // Get PASID N for the mm
fork(): // spawn process B
exit();
ioasid_free(N);
process B:
device uses PASID N -> failure
sva_unbind();
Dave Hansen suggests to take a refcount on the mm whenever binding the
PASID to a device and drop the refcount on unbinding. The mm won't be
dropped if the PASID is still bound to it.
Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit")
Reported-by: Zhangfei Gao <[email protected]>
Suggested-by: Dave Hansen" <[email protected]>
Signed-off-by: Fenghua Yu <[email protected]>
---
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 6 ++++++
drivers/iommu/intel/svm.c | 4 ++++
2 files changed, 10 insertions(+)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 22ddd05bbdcd..3fcb842a0df0 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -7,6 +7,7 @@
#include <linux/mmu_context.h>
#include <linux/mmu_notifier.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include "arm-smmu-v3.h"
#include "../../iommu-sva-lib.h"
@@ -363,6 +364,9 @@ arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
mutex_lock(&sva_lock);
handle = __arm_smmu_sva_bind(dev, mm);
+ /* Take an mm refcount on a successful bind. */
+ if (!IS_ERR(handle))
+ mmget(mm);
mutex_unlock(&sva_lock);
return handle;
}
@@ -372,6 +376,8 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle)
struct arm_smmu_bond *bond = sva_to_bond(handle);
mutex_lock(&sva_lock);
+ /* Drop an mm refcount. */
+ mmput(bond->mm);
if (refcount_dec_and_test(&bond->refs)) {
list_del(&bond->list);
arm_smmu_mmu_notifier_put(bond->smmu_mn);
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 23a38763c1d1..345a0d5d7922 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -403,6 +403,8 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
goto free_sdev;
list_add_rcu(&sdev->list, &svm->devs);
+ /* Take an mm refcount on binding mm. */
+ mmget(mm);
success:
return &sdev->sva;
@@ -465,6 +467,8 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
kfree(svm);
}
}
+ /* Drop an mm reference on unbinding mm. */
+ mmput(mm);
}
out:
return ret;
--
2.32.0
On 2022/4/15 下午6:50, Fenghua Yu wrote:
> Hi, Zhangfei,
>
> On Fri, Apr 15, 2022 at 06:14:09PM +0800, [email protected] wrote:
>>
>> On 2022/4/15 下午5:51, Fenghua Yu wrote:
>>> On Thu, Apr 14, 2022 at 06:08:09PM +0800, [email protected] wrote:
>>>> On 2022/4/12 下午11:35, [email protected] wrote:
>>>>> Hi, Fenghua
>>>>>
>>>>> On 2022/4/12 下午9:41, Fenghua Yu wrote:
>>> From a6444e1e5bd8076f5e5c5e950d3192de327f0c9c Mon Sep 17 00:00:00 2001
>>> From: Fenghua Yu <[email protected]>
>>> Date: Fri, 15 Apr 2022 00:51:33 -0700
>>> Subject: [RFC PATCH] iommu/sva: Fix PASID use-after-free issue
>>>
>>> A PASID might be still used even though it is freed on mm exit.
>>>
>>> process A:
>>> sva_bind();
>>> ioasid_alloc() = N; // Get PASID N for the mm
>>> fork(): // spawn process B
>>> exit();
>>> ioasid_free(N);
>>>
>>> process B:
>>> device uses PASID N -> failure
>>> sva_unbind();
>>>
>>> Dave Hansen suggests to take a refcount on the mm whenever binding the
>>> PASID to a device and drop the refcount on unbinding. The mm won't be
>>> dropped if the PASID is still bound to it.
>>>
>>> Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit")
>>>
>>> Reported-by: Zhangfei Gao <[email protected]>
>>> Suggested-by: Dave Hansen" <[email protected]>
>>> Signed-off-by: Fenghua Yu <[email protected]>
>>> ---
>>> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 6 ++++++
>>> drivers/iommu/intel/svm.c | 4 ++++
>>> 2 files changed, 10 insertions(+)
>>>
>>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>>> index 22ddd05bbdcd..3fcb842a0df0 100644
>>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>>> @@ -7,6 +7,7 @@
>>> #include <linux/mmu_context.h>
>>> #include <linux/mmu_notifier.h>
>>> #include <linux/slab.h>
>>> +#include <linux/sched/mm.h>
>>> #include "arm-smmu-v3.h"
>>> #include "../../iommu-sva-lib.h"
>>> @@ -363,6 +364,9 @@ arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
>>> mutex_lock(&sva_lock);
>>> handle = __arm_smmu_sva_bind(dev, mm);
>>> + /* Take an mm refcount on a successful bind. */
>>> + if (!IS_ERR(handle))
>>> + mmget(mm);
>>> mutex_unlock(&sva_lock);
>>> return handle;
>>> }
>>> @@ -372,6 +376,8 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle)
>>> struct arm_smmu_bond *bond = sva_to_bond(handle);
>>> mutex_lock(&sva_lock);
>>> + /* Drop an mm refcount. */
>>> + mmput(bond->mm);
>>> if (refcount_dec_and_test(&bond->refs)) {
>>> list_del(&bond->list);
>>> arm_smmu_mmu_notifier_put(bond->smmu_mn);
>>> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
>>> index 23a38763c1d1..345a0d5d7922 100644
>>> --- a/drivers/iommu/intel/svm.c
>>> +++ b/drivers/iommu/intel/svm.c
>>> @@ -403,6 +403,8 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
>>> goto free_sdev;
>>> list_add_rcu(&sdev->list, &svm->devs);
>>> + /* Take an mm refcount on binding mm. */
>>> + mmget(mm);
>>> success:
>>> return &sdev->sva;
>>> @@ -465,6 +467,8 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
>>> kfree(svm);
>>> }
>>> }
>>> + /* Drop an mm reference on unbinding mm. */
>>> + mmput(mm);
>>> }
>>> out:
>>> return ret;
>> This patch can not be applied on 5.18-rc2 for intel part.
> What error do you see? Could you please send to me errors?
>
> I download this patch from:
> https://lore.kernel.org/lkml/[email protected]/raw
> git am to either v5.18-rc2 or the latest upstream without any issue.
It is my copy paste issue.
I have tested, nginx woks well.
Other than the following issue,
Each time /sbin/nginx will alloc ioasid but not free.
which I think it maybe nginx issue or the mis-usage, will ask there.
Tested-by: Zhangfei Gao <[email protected]>
>
>> It should work for arm.
>>
>> In fact I have a similar patch at hand but pending since I found an issue.
>>
>> I start & stop nginx via this cmd.
>> //start
>> sudo sbin/nginx // this alloc an ioasid=1
>> //stop
>> sudo sbin/nginx -s quit // this does not free ioasid=1, but still alloc
>> ioasid=2.
>> So ioasid will keep allocated but not freed if continue start/stop nginx,
>> though not impact the nginx function.
>>
>> stop nginx with -s quit still calls
>> src/core/nginx.c
>> main -> ngx_ssl_init -> openssl engine: bind_fn -> ... -> alloc asid
>> But openssl engine: ENGINE_free is not called
>>
>> Still in checking nginx code.
>>
>> Or do you test with nginx?
> On my X86 machine, nginx doesn't trigger the kernel sva binding function
> to allocate ioasid. I tried pre- nstalled nginx/openssl and also tried my built
> a few versions of nginx/openssl. nginx does call OPENSSL_init_ssl() but
> doesn't go to the binding function. Don't know if it's my configuration issue.
> Maybe you can give me some advice?
I am using openssl engine, which use crypto driver and using sva via uacce.
nginx -> openssl -> openssl engine -> sva related.
>
> I test the patch with a few internal test tools and observe mmget()/mmput()
> works fine in various cases.
OK, thanks
Hi, Zhangfei,
On Fri, Apr 15, 2022 at 07:52:03PM +0800, [email protected] wrote:
>
>
> On 2022/4/15 下午6:50, Fenghua Yu wrote:
> > Hi, Zhangfei,
> >
> > On Fri, Apr 15, 2022 at 06:14:09PM +0800, [email protected] wrote:
> > I download this patch from:
> > https://lore.kernel.org/lkml/[email protected]/raw
> > git am to either v5.18-rc2 or the latest upstream without any issue.
> It is my copy paste issue.
>
> I have tested, nginx woks well.
Great!
>
> Other than the following issue,
> Each time /sbin/nginx will alloc ioasid but not free.
> which I think it maybe nginx issue or the mis-usage, will ask there.
Which nginx/openssl function is supposed to call kernel sva_unbind? I couldn't
find the function in nginx tree.
If nginx doesn't free ioasid, it will cause ioasid leak and memory leak.
>
> Tested-by: Zhangfei Gao <[email protected]>
Thank you for your testing!
>
> >
> > > It should work for arm.
> > >
> > > In fact I have a similar patch at hand but pending since I found an issue.
> > >
> > > I start & stop nginx via this cmd.
> > > //start
> > > sudo sbin/nginx // this alloc an ioasid=1
> > > //stop
> > > sudo sbin/nginx -s quit // this does not free ioasid=1, but still alloc
> > > ioasid=2.
> > > So ioasid will keep allocated but not freed if continue start/stop nginx,
> > > though not impact the nginx function.
> > >
> > > stop nginx with -s quit still calls
> > > src/core/nginx.c
> > > main -> ngx_ssl_init -> openssl engine: bind_fn -> ... -> alloc asid
> > > But openssl engine: ENGINE_free is not called
> > >
> > > Still in checking nginx code.
> > >
> > > Or do you test with nginx?
> > On my X86 machine, nginx doesn't trigger the kernel sva binding function
> > to allocate ioasid. I tried pre- nstalled nginx/openssl and also tried my built
> > a few versions of nginx/openssl. nginx does call OPENSSL_init_ssl() but
> > doesn't go to the binding function. Don't know if it's my configuration issue.
> > Maybe you can give me some advice?
> I am using openssl engine, which use crypto driver and using sva via uacce.
> nginx -> openssl -> openssl engine -> sva related.
I'll do more nginx experiments.
>
> >
> > I test the patch with a few internal test tools and observe mmget()/mmput()
> > works fine in various cases.
> OK, thanks
Thank you very much!
-Fenghua
Hi [email protected],
On Fri, 15 Apr 2022 19:52:03 +0800, "[email protected]"
<[email protected]> wrote:
> >>> A PASID might be still used even though it is freed on mm exit.
> >>>
> >>> process A:
> >>> sva_bind();
> >>> ioasid_alloc() = N; // Get PASID N for the mm
> >>> fork(): // spawn process B
> >>> exit();
> >>> ioasid_free(N);
> >>>
> >>> process B:
> >>> device uses PASID N -> failure
> >>> sva_unbind();
> >>>
> >>> Dave Hansen suggests to take a refcount on the mm whenever binding the
> >>> PASID to a device and drop the refcount on unbinding. The mm won't be
> >>> dropped if the PASID is still bound to it.
> >>>
> >>> Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID
> >>> allocation and free it on mm exit")
> >>>
Is process A's mm intended to be used by process B? Or you really should
use PASID N on process B's mm? If the latter, it may work for a while until
B changes mapping.
It seems you are just extending the life of a defunct mm?
Thanks,
Jacob
Hi, Zhangfei,
On Fri, Apr 15, 2022 at 06:14:09PM +0800, [email protected] wrote:
>
>
> On 2022/4/15 下午5:51, Fenghua Yu wrote:
> > On Thu, Apr 14, 2022 at 06:08:09PM +0800, [email protected] wrote:
> > > On 2022/4/12 下午11:35, [email protected] wrote:
> > > > Hi, Fenghua
> > > >
> > > > On 2022/4/12 下午9:41, Fenghua Yu wrote:
> > From a6444e1e5bd8076f5e5c5e950d3192de327f0c9c Mon Sep 17 00:00:00 2001
> > From: Fenghua Yu <[email protected]>
> > Date: Fri, 15 Apr 2022 00:51:33 -0700
> > Subject: [RFC PATCH] iommu/sva: Fix PASID use-after-free issue
> >
> > A PASID might be still used even though it is freed on mm exit.
> >
> > process A:
> > sva_bind();
> > ioasid_alloc() = N; // Get PASID N for the mm
> > fork(): // spawn process B
> > exit();
> > ioasid_free(N);
> >
> > process B:
> > device uses PASID N -> failure
> > sva_unbind();
> >
> > Dave Hansen suggests to take a refcount on the mm whenever binding the
> > PASID to a device and drop the refcount on unbinding. The mm won't be
> > dropped if the PASID is still bound to it.
> >
> > Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit")
> >
> > Reported-by: Zhangfei Gao <[email protected]>
> > Suggested-by: Dave Hansen" <[email protected]>
> > Signed-off-by: Fenghua Yu <[email protected]>
> > ---
> > drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 6 ++++++
> > drivers/iommu/intel/svm.c | 4 ++++
> > 2 files changed, 10 insertions(+)
> >
> > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> > index 22ddd05bbdcd..3fcb842a0df0 100644
> > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> > @@ -7,6 +7,7 @@
> > #include <linux/mmu_context.h>
> > #include <linux/mmu_notifier.h>
> > #include <linux/slab.h>
> > +#include <linux/sched/mm.h>
> > #include "arm-smmu-v3.h"
> > #include "../../iommu-sva-lib.h"
> > @@ -363,6 +364,9 @@ arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
> > mutex_lock(&sva_lock);
> > handle = __arm_smmu_sva_bind(dev, mm);
> > + /* Take an mm refcount on a successful bind. */
> > + if (!IS_ERR(handle))
> > + mmget(mm);
> > mutex_unlock(&sva_lock);
> > return handle;
> > }
> > @@ -372,6 +376,8 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle)
> > struct arm_smmu_bond *bond = sva_to_bond(handle);
> > mutex_lock(&sva_lock);
> > + /* Drop an mm refcount. */
> > + mmput(bond->mm);
> > if (refcount_dec_and_test(&bond->refs)) {
> > list_del(&bond->list);
> > arm_smmu_mmu_notifier_put(bond->smmu_mn);
> > diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
> > index 23a38763c1d1..345a0d5d7922 100644
> > --- a/drivers/iommu/intel/svm.c
> > +++ b/drivers/iommu/intel/svm.c
> > @@ -403,6 +403,8 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
> > goto free_sdev;
> > list_add_rcu(&sdev->list, &svm->devs);
> > + /* Take an mm refcount on binding mm. */
> > + mmget(mm);
> > success:
> > return &sdev->sva;
> > @@ -465,6 +467,8 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
> > kfree(svm);
> > }
> > }
> > + /* Drop an mm reference on unbinding mm. */
> > + mmput(mm);
> > }
> > out:
> > return ret;
> This patch can not be applied on 5.18-rc2 for intel part.
What error do you see? Could you please send to me errors?
I download this patch from:
https://lore.kernel.org/lkml/[email protected]/raw
git am to either v5.18-rc2 or the latest upstream without any issue.
> It should work for arm.
>
> In fact I have a similar patch at hand but pending since I found an issue.
>
> I start & stop nginx via this cmd.
> //start
> sudo sbin/nginx // this alloc an ioasid=1
> //stop
> sudo sbin/nginx -s quit // this does not free ioasid=1, but still alloc
> ioasid=2.
> So ioasid will keep allocated but not freed if continue start/stop nginx,
> though not impact the nginx function.
>
> stop nginx with -s quit still calls
> src/core/nginx.c
> main -> ngx_ssl_init -> openssl engine: bind_fn -> ... -> alloc asid
> But openssl engine: ENGINE_free is not called
>
> Still in checking nginx code.
>
> Or do you test with nginx?
On my X86 machine, nginx doesn't trigger the kernel sva binding function
to allocate ioasid. I tried pre- nstalled nginx/openssl and also tried my built
a few versions of nginx/openssl. nginx does call OPENSSL_init_ssl() but
doesn't go to the binding function. Don't know if it's my configuration issue.
Maybe you can give me some advice?
I test the patch with a few internal test tools and observe mmget()/mmput()
works fine in various cases.
Thanks.
-Fenghua
Hi, Zhangfei,
On Fri, Apr 15, 2022 at 07:52:03PM +0800, [email protected] wrote:
> > On my X86 machine, nginx doesn't trigger the kernel sva binding function
> > to allocate ioasid. I tried pre- nstalled nginx/openssl and also tried my built
> > a few versions of nginx/openssl. nginx does call OPENSSL_init_ssl() but
> > doesn't go to the binding function. Don't know if it's my configuration issue.
> > Maybe you can give me some advice?
> I am using openssl engine, which use crypto driver and using sva via uacce.
> nginx -> openssl -> openssl engine -> sva related.
uacce is not used on X86. That's why I cannot test IOASID/PASID by nginx
on X86.
I only can test the RFC patch by other test tools via IDXD driver which uses
PASID on X86.
Thanks.
-Fenghua
On 2022/4/15 下午8:37, Fenghua Yu wrote:
> Hi, Zhangfei,
>
> On Fri, Apr 15, 2022 at 07:52:03PM +0800, [email protected] wrote:
>>
>> On 2022/4/15 下午6:50, Fenghua Yu wrote:
>>> Hi, Zhangfei,
>>>
>>> On Fri, Apr 15, 2022 at 06:14:09PM +0800, [email protected] wrote:
>>> I download this patch from:
>>> https://lore.kernel.org/lkml/[email protected]/raw
>>> git am to either v5.18-rc2 or the latest upstream without any issue.
>> It is my copy paste issue.
>>
>> I have tested, nginx woks well.
> Great!
>
>> Other than the following issue,
>> Each time /sbin/nginx will alloc ioasid but not free.
>> which I think it maybe nginx issue or the mis-usage, will ask there.
> Which nginx/openssl function is supposed to call kernel sva_unbind? I couldn't
> find the function in nginx tree.
>
> If nginx doesn't free ioasid, it will cause ioasid leak and memory leak.
Yes
In my case, sva_bind/unbind is from openssl_engine, bind_fn, not in
nginx itself
nginx will use openssl -> openssl engine.
nginx:
src/core/nginx.c
main
ngx_ssl_init(log);
OPENSSL_init_ssl(OPENSSL_INIT_LOAD_CONFIG, NULL)
openssl_engine
IMPLEMENT_DYNAMIC_BIND_FN(bind_fn)
bind_fn
sva_bind
destroy
sva_unbind
But destroy seems not called in sbin/nginx -s quit.
Thanks
On 2022/4/16 上午5:00, Jacob Pan wrote:
> Hi [email protected],
>
> On Fri, 15 Apr 2022 19:52:03 +0800, "[email protected]"
> <[email protected]> wrote:
>
>>>>> A PASID might be still used even though it is freed on mm exit.
>>>>>
>>>>> process A:
>>>>> sva_bind();
>>>>> ioasid_alloc() = N; // Get PASID N for the mm
>>>>> fork(): // spawn process B
>>>>> exit();
>>>>> ioasid_free(N);
>>>>>
>>>>> process B:
>>>>> device uses PASID N -> failure
>>>>> sva_unbind();
>>>>>
>>>>> Dave Hansen suggests to take a refcount on the mm whenever binding the
>>>>> PASID to a device and drop the refcount on unbinding. The mm won't be
>>>>> dropped if the PASID is still bound to it.
>>>>>
>>>>> Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID
>>>>> allocation and free it on mm exit")
>>>>>
> Is process A's mm intended to be used by process B? Or you really should
> use PASID N on process B's mm? If the latter, it may work for a while until
> B changes mapping.
>
> It seems you are just extending the life of a defunct mm?
From nginx code, the master process init resources, then fork daemon
process to take over,
then master process exit by itself.
src/core/nginx.c
main
ngx_ssl_init(log); -> openssl engine -> bind_fn -> sva_bind()
ngx_daemon(cycle->log)
src/os/unix/ngx_daemon.c
ngx_daemon(ngx_log_t *log)
{
int fd;
switch (fork()) {
case -1:
ngx_log_error(NGX_LOG_EMERG, log, ngx_errno, "fork() failed");
return NGX_ERROR;
case 0:
// the fork daemon process
break;
default:
// master process directly exit, and release mm as well as ioasid
exit(0);
}
// only daemon process
Thanks
>
> Thanks,
>
> Jacob
> From: Jacob Pan <[email protected]>
> Sent: Saturday, April 16, 2022 5:00 AM
>
> Hi [email protected],
>
> On Fri, 15 Apr 2022 19:52:03 +0800, "[email protected]"
> <[email protected]> wrote:
>
> > >>> A PASID might be still used even though it is freed on mm exit.
> > >>>
> > >>> process A:
> > >>> sva_bind();
> > >>> ioasid_alloc() = N; // Get PASID N for the mm
> > >>> fork(): // spawn process B
> > >>> exit();
> > >>> ioasid_free(N);
> > >>>
> > >>> process B:
> > >>> device uses PASID N -> failure
> > >>> sva_unbind();
> > >>>
> > >>> Dave Hansen suggests to take a refcount on the mm whenever binding
> the
> > >>> PASID to a device and drop the refcount on unbinding. The mm won't
> be
> > >>> dropped if the PASID is still bound to it.
> > >>>
> > >>> Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID
> > >>> allocation and free it on mm exit")
> > >>>
> Is process A's mm intended to be used by process B? Or you really should
> use PASID N on process B's mm? If the latter, it may work for a while until
> B changes mapping.
>
> It seems you are just extending the life of a defunct mm?
>
IMHO the intention is not to allow B to access A's mm.
The problem is that PASID N is released on exit() of A and then
reallocated to B before iommu driver gets the chance to quiesce
the device and clear the PASID entry. According to the discussion
the quiesce operation must be done when driver calls unbind()
instead of in mm exit. In this case a failure is reported when
B tries to call bind() on PASID N due to an already-present entry.
Dave's patch extending the life of A's mm until unbind() is called.
With it B either gets a different PASID before A's unbind() is
completed or same PASID N pointing to B's mm after A's unbind().
Thanks
Kevin
Hi [email protected],
On Sat, 16 Apr 2022 09:43:07 +0800, "[email protected]"
<[email protected]> wrote:
> On 2022/4/16 上午5:00, Jacob Pan wrote:
> > Hi [email protected],
> >
> > On Fri, 15 Apr 2022 19:52:03 +0800, "[email protected]"
> > <[email protected]> wrote:
> >
> >>>>> A PASID might be still used even though it is freed on mm exit.
> >>>>>
> >>>>> process A:
> >>>>> sva_bind();
> >>>>> ioasid_alloc() = N; // Get PASID N for the mm
> >>>>> fork(): // spawn process B
> >>>>> exit();
> >>>>> ioasid_free(N);
> >>>>>
> >>>>> process B:
> >>>>> device uses PASID N -> failure
> >>>>> sva_unbind();
> >>>>>
> >>>>> Dave Hansen suggests to take a refcount on the mm whenever binding
> >>>>> the PASID to a device and drop the refcount on unbinding. The mm
> >>>>> won't be dropped if the PASID is still bound to it.
> >>>>>
> >>>>> Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID
> >>>>> allocation and free it on mm exit")
> >>>>>
> > Is process A's mm intended to be used by process B? Or you really should
> > use PASID N on process B's mm? If the latter, it may work for a while
> > until B changes mapping.
> >
> > It seems you are just extending the life of a defunct mm?
>
> From nginx code, the master process init resources, then fork daemon
> process to take over,
> then master process exit by itself.
>
> src/core/nginx.c
> main
> ngx_ssl_init(log); -> openssl engine -> bind_fn -> sva_bind()
> ngx_daemon(cycle->log)
>
> src/os/unix/ngx_daemon.c
> ngx_daemon(ngx_log_t *log)
> {
> int fd;
>
> switch (fork()) {
> case -1:
> ngx_log_error(NGX_LOG_EMERG, log, ngx_errno, "fork() failed");
> return NGX_ERROR;
>
> case 0:
> // the fork daemon process
> break;
>
Does this child process call sva_bind() again to get another PASID? Or it
will keep using the parent's PASID for DMA?
> default:
> // master process directly exit, and release mm as well as ioasid
> exit(0);
> }
>
> // only daemon process
>
> Thanks
>
> >
> > Thanks,
> >
> > Jacob
>
Thanks,
Jacob
On 2022/4/19 上午2:14, Jacob Pan wrote:
> Hi [email protected],
>
> On Sat, 16 Apr 2022 09:43:07 +0800, "[email protected]"
> <[email protected]> wrote:
>
>> On 2022/4/16 上午5:00, Jacob Pan wrote:
>>> Hi [email protected],
>>>
>>> On Fri, 15 Apr 2022 19:52:03 +0800, "[email protected]"
>>> <[email protected]> wrote:
>>>
>>>>>>> A PASID might be still used even though it is freed on mm exit.
>>>>>>>
>>>>>>> process A:
>>>>>>> sva_bind();
>>>>>>> ioasid_alloc() = N; // Get PASID N for the mm
>>>>>>> fork(): // spawn process B
>>>>>>> exit();
>>>>>>> ioasid_free(N);
>>>>>>>
>>>>>>> process B:
>>>>>>> device uses PASID N -> failure
>>>>>>> sva_unbind();
>>>>>>>
>>>>>>> Dave Hansen suggests to take a refcount on the mm whenever binding
>>>>>>> the PASID to a device and drop the refcount on unbinding. The mm
>>>>>>> won't be dropped if the PASID is still bound to it.
>>>>>>>
>>>>>>> Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID
>>>>>>> allocation and free it on mm exit")
>>>>>>>
>>> Is process A's mm intended to be used by process B? Or you really should
>>> use PASID N on process B's mm? If the latter, it may work for a while
>>> until B changes mapping.
>>>
>>> It seems you are just extending the life of a defunct mm?
>> From nginx code, the master process init resources, then fork daemon
>> process to take over,
>> then master process exit by itself.
>>
>> src/core/nginx.c
>> main
>> ngx_ssl_init(log); -> openssl engine -> bind_fn -> sva_bind()
>> ngx_daemon(cycle->log)
>>
>> src/os/unix/ngx_daemon.c
>> ngx_daemon(ngx_log_t *log)
>> {
>> int fd;
>>
>> switch (fork()) {
>> case -1:
>> ngx_log_error(NGX_LOG_EMERG, log, ngx_errno, "fork() failed");
>> return NGX_ERROR;
>>
>> case 0:
>> // the fork daemon process
>> break;
>>
> Does this child process call sva_bind() again to get another PASID? Or it
> will keep using the parent's PASID for DMA?
The master process call sva_bind (PASID A), fork daemon process, then exit.
The daemon process does not call sva_bind again, only for managing
worker processes.
The worker process will call sva_bind for new PASID (B), for real
transaction.
The worker process will free the PASID (B) when worker process exit like
nginx quit.
nginx -s quit does not free PASID A via callback, which may should be
freed by signal handler in engine itself, still in check.
Thanks
Hi Kevin,
On Mon, 18 Apr 2022 06:34:19 +0000, "Tian, Kevin" <[email protected]>
wrote:
> > From: Jacob Pan <[email protected]>
> > Sent: Saturday, April 16, 2022 5:00 AM
> >
> > Hi [email protected],
> >
> > On Fri, 15 Apr 2022 19:52:03 +0800, "[email protected]"
> > <[email protected]> wrote:
> >
> > > >>> A PASID might be still used even though it is freed on mm exit.
> > > >>>
> > > >>> process A:
> > > >>> sva_bind();
> > > >>> ioasid_alloc() = N; // Get PASID N for the mm
> > > >>> fork(): // spawn process B
> > > >>> exit();
> > > >>> ioasid_free(N);
> > > >>>
> > > >>> process B:
> > > >>> device uses PASID N -> failure
> > > >>> sva_unbind();
> > > >>>
> > > >>> Dave Hansen suggests to take a refcount on the mm whenever
> > > >>> binding
> > the
> > > >>> PASID to a device and drop the refcount on unbinding. The mm
> > > >>> won't
> > be
> > > >>> dropped if the PASID is still bound to it.
> > > >>>
> > > >>> Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID
> > > >>> allocation and free it on mm exit")
> > > >>>
> > Is process A's mm intended to be used by process B? Or you really should
> > use PASID N on process B's mm? If the latter, it may work for a while
> > until B changes mapping.
> >
> > It seems you are just extending the life of a defunct mm?
> >
>
> IMHO the intention is not to allow B to access A's mm.
>
> The problem is that PASID N is released on exit() of A and then
> reallocated to B before iommu driver gets the chance to quiesce
> the device and clear the PASID entry. According to the discussion
> the quiesce operation must be done when driver calls unbind()
> instead of in mm exit. In this case a failure is reported when
> B tries to call bind() on PASID N due to an already-present entry.
>
> Dave's patch extending the life of A's mm until unbind() is called.
> With it B either gets a different PASID before A's unbind() is
> completed or same PASID N pointing to B's mm after A's unbind().
>
As long as B gets a different PASID, that is fine. It seems PASID N has no
use then.
> Thanks
> Kevin
Thanks,
Jacob
On Fri, Apr 22, 2022 at 05:03:10PM +0800, [email protected] wrote:
[...]
> > Have tested, still got some issue with our openssl-engine.
> >
> > 1. If openssl-engine does not register rsa, nginx works well.
> >
> > 2. If openssl-engine register rsa, nginx also works, but ioasid is not
> > freed when nginx stop.
> >
> > IMPLEMENT_DYNAMIC_BIND_FN(bind_fn)
> > bind_fn
> > ENGINE_set_RSA(e, rsa_methods())
> >
> > destroy_fn
> >
> > If ENGINE_set_RSA is set, nginx start and stop will NOT call destroy_fn.
> > Even rsa_methods is almost new via RSA_meth_new.
> >
> > In 5.18-rcx, this caused ioasid not freed in nginx start and stop.
> > In 5.17, though destroy_fn is not called, but ioasid is freed when nginx
> > stop, so not noticed this issue before.
>
> 1. uacce_fops_release
> In 5.16 or 5.17
> In fact, we aslo has the issue: openssl engine does not call destroy_fn ->
> close(uacce_fd)
> But system will automatically close all opened fd,
> so uacce_fops_release is also called and free ioasid.
>
> Have one experiment, not call close fd
>
> log: open uacce fd but no close
> [ 2583.471225] dump_backtrace+0x0/0x1a0
> [ 2583.474876] show_stack+0x20/0x30
> [ 2583.478178] dump_stack_lvl+0x8c/0xb8
> [ 2583.481825] dump_stack+0x18/0x34
> [ 2583.485126] uacce_fops_release+0x44/0xdc
> [ 2583.489117] __fput+0x78/0x240
> [ 2583.492159] ____fput+0x18/0x28
> [ 2583.495288] task_work_run+0x88/0x160
> [ 2583.498936] do_notify_resume+0x214/0x490
> [ 2583.502927] el0_svc+0x58/0x70
> [ 2583.505968] el0t_64_sync_handler+0xb0/0xb8
> [ 2583.510132] el0t_64_sync+0x1a0/0x1a4
> [ 2583.582292] uacce_fops_release q=00000000d6674128
>
> In 5.18, since refcount was add.
> The opened uacce fd was not closed automatically by system
> So we see the issue.
>
> log: open uacce fd but no close
> [ 106.360140] uacce_fops_open q=00000000ccc38d74
> [ 106.364929] ioasid_alloc ioasid=1
> [ 106.368585] iommu_sva_alloc_pasid pasid=1
> [ 106.372943] iommu_sva_bind_device handle=000000006cca298a
> // ioasid is not free
I'm trying to piece together what happens from the kernel point of view.
* master process with mm A opens a queue fd through uacce, which calls
iommu_sva_bind_device(dev, A) -> PASID 1
* master forks and exits. Child (daemon) gets mm B, inherits the queue fd.
The device is still bound to mm A with PASID 1, since the queue fd is
still open.
We discussed this before, but I don't remember where we left off. The
child can't use the queue because its mappings are not copied on fork(),
and the queue is still bound to the parent mm A. The child either needs to
open a new queue or take ownership of the old one with a new uacce ioctl.
Is that the "IMPLEMENT_DYNAMIC_BIND_FN()" you mention, something out of
tree? This operation should unbind from A before binding to B, no?
Otherwise we leak PASID 1.
Thanks,
Jean
Hi, Jean
On 2022/4/22 下午6:11, Jean-Philippe Brucker wrote:
> On Fri, Apr 22, 2022 at 05:03:10PM +0800, [email protected] wrote:
> [...]
>>> Have tested, still got some issue with our openssl-engine.
>>>
>>> 1. If openssl-engine does not register rsa, nginx works well.
>>>
>>> 2. If openssl-engine register rsa, nginx also works, but ioasid is not
>>> freed when nginx stop.
>>>
>>> IMPLEMENT_DYNAMIC_BIND_FN(bind_fn)
>>> bind_fn
>>> ENGINE_set_RSA(e, rsa_methods())
>>>
>>> destroy_fn
>>>
>>> If ENGINE_set_RSA is set, nginx start and stop will NOT call destroy_fn.
>>> Even rsa_methods is almost new via RSA_meth_new.
>>>
>>> In 5.18-rcx, this caused ioasid not freed in nginx start and stop.
>>> In 5.17, though destroy_fn is not called, but ioasid is freed when nginx
>>> stop, so not noticed this issue before.
>> 1. uacce_fops_release
>> In 5.16 or 5.17
>> In fact, we aslo has the issue: openssl engine does not call destroy_fn ->
>> close(uacce_fd)
>> But system will automatically close all opened fd,
>> so uacce_fops_release is also called and free ioasid.
>>
>> Have one experiment, not call close fd
>>
>> log: open uacce fd but no close
>> [ 2583.471225] dump_backtrace+0x0/0x1a0
>> [ 2583.474876] show_stack+0x20/0x30
>> [ 2583.478178] dump_stack_lvl+0x8c/0xb8
>> [ 2583.481825] dump_stack+0x18/0x34
>> [ 2583.485126] uacce_fops_release+0x44/0xdc
>> [ 2583.489117] __fput+0x78/0x240
>> [ 2583.492159] ____fput+0x18/0x28
>> [ 2583.495288] task_work_run+0x88/0x160
>> [ 2583.498936] do_notify_resume+0x214/0x490
>> [ 2583.502927] el0_svc+0x58/0x70
>> [ 2583.505968] el0t_64_sync_handler+0xb0/0xb8
>> [ 2583.510132] el0t_64_sync+0x1a0/0x1a4
>> [ 2583.582292] uacce_fops_release q=00000000d6674128
>>
>> In 5.18, since refcount was add.
>> The opened uacce fd was not closed automatically by system
>> So we see the issue.
>>
>> log: open uacce fd but no close
>> [ 106.360140] uacce_fops_open q=00000000ccc38d74
>> [ 106.364929] ioasid_alloc ioasid=1
>> [ 106.368585] iommu_sva_alloc_pasid pasid=1
>> [ 106.372943] iommu_sva_bind_device handle=000000006cca298a
>> // ioasid is not free
> I'm trying to piece together what happens from the kernel point of view.
>
> * master process with mm A opens a queue fd through uacce, which calls
> iommu_sva_bind_device(dev, A) -> PASID 1
>
> * master forks and exits. Child (daemon) gets mm B, inherits the queue fd.
> The device is still bound to mm A with PASID 1, since the queue fd is
> still open.
> We discussed this before, but I don't remember where we left off. The
> child can't use the queue because its mappings are not copied on fork(),
> and the queue is still bound to the parent mm A. The child either needs to
> open a new queue or take ownership of the old one with a new uacce ioctl.
Yes, currently nginx aligned with the case.
Child process (worker process) reopen uacce,
Master process (do init) open uacce, iommu_sva_bind_device(dev, A) ->
PASID 1
Master process fork Child (daemon) and exit.
Child (daemon) does not use PASID 1 any more, only fork and manage
worker process.
Worker process reopen uacce, iommu_sva_bind_device(dev, B) PASID 2
So it is expected.
> Is that the "IMPLEMENT_DYNAMIC_BIND_FN()" you mention, something out of
> tree? This operation should unbind from A before binding to B, no?
> Otherwise we leak PASID 1.
In 5.16 PASID 1 from master is hold until nginx service stop.
nginx start
master:
iommu_sva_alloc_pasid mm->pasid=1 // master process
lynx https start:
iommu_sva_alloc_pasid mm->pasid=2 //worker process
nginx stop: from fops_release
iommu_sva_free_pasid mm->pasid=2 // worker process
iommu_sva_free_pasid mm->pasid=1 // master process
Have one silly question.
kerne driver
fops_open
iommu_sva_bind_device
fops_release
iommu_sva_unbind_device
application
main()
fd = open
return;
Application exit but not close(fd), is it expected fops_release will be
called automatically by system?
On 5.17
fops_release is called automatically, as well as iommu_sva_unbind_device.
On 5.18-rc1.
fops_release is not called, have to manually call close(fd)
Since nginx may have a issue, it does not call close(fd) when nginx -s quit.
Thanks
Hi,
On Fri, Apr 15, 2022 at 02:51:08AM -0700, Fenghua Yu wrote:
> From a6444e1e5bd8076f5e5c5e950d3192de327f0c9c Mon Sep 17 00:00:00 2001
> From: Fenghua Yu <[email protected]>
> Date: Fri, 15 Apr 2022 00:51:33 -0700
> Subject: [RFC PATCH] iommu/sva: Fix PASID use-after-free issue
>
> A PASID might be still used even though it is freed on mm exit.
>
> process A:
> sva_bind();
> ioasid_alloc() = N; // Get PASID N for the mm
> fork(): // spawn process B
> exit();
> ioasid_free(N);
>
> process B:
> device uses PASID N -> failure
> sva_unbind();
>
> Dave Hansen suggests to take a refcount on the mm whenever binding the
> PASID to a device and drop the refcount on unbinding. The mm won't be
> dropped if the PASID is still bound to it.
>
> Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit")
>
> Reported-by: Zhangfei Gao <[email protected]>
> Suggested-by: Dave Hansen" <[email protected]>
> Signed-off-by: Fenghua Yu <[email protected]>
> ---
> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 6 ++++++
> drivers/iommu/intel/svm.c | 4 ++++
> 2 files changed, 10 insertions(+)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> index 22ddd05bbdcd..3fcb842a0df0 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> @@ -7,6 +7,7 @@
> #include <linux/mmu_context.h>
> #include <linux/mmu_notifier.h>
> #include <linux/slab.h>
> +#include <linux/sched/mm.h>
>
> #include "arm-smmu-v3.h"
> #include "../../iommu-sva-lib.h"
> @@ -363,6 +364,9 @@ arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
>
> mutex_lock(&sva_lock);
> handle = __arm_smmu_sva_bind(dev, mm);
> + /* Take an mm refcount on a successful bind. */
> + if (!IS_ERR(handle))
> + mmget(mm);
> mutex_unlock(&sva_lock);
> return handle;
> }
> @@ -372,6 +376,8 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle)
> struct arm_smmu_bond *bond = sva_to_bond(handle);
>
> mutex_lock(&sva_lock);
> + /* Drop an mm refcount. */
> + mmput(bond->mm);
I do like the idea because it will simplify the driver. We can't call
mmput() here, though, because it may call the release() MMU notifier which
will try to grab sva_lock, already held.
I also found another use-after-free in arm_smmu_free_shared_cd(), where we
call arm64_mm_context_put() when the mm could already be freed. There used
to be an mmgrab() preventing this but it went away during a rewrite.
To fix both we could just move mmput() at the end of unbind() but I'd
rather do a proper cleanup removing the release() notifier right away.
Zhangfei, could you try the patch below?
Thanks,
Jean
--- 8< ---
From 4e09c0d71dfb35fc90915bd1e36545027fbf8a03 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <[email protected]>
Date: Wed, 20 Apr 2022 10:19:24 +0100
Subject: [PATCH] iommu/arm-smmu-v3-sva: Fix PASID and mm use-after-free issues
Commit 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID
allocation and free it on mm exit") frees the PASID earlier than what
the SMMUv3 driver expects. At the moment the SMMU driver handles mm exit
in the release() MMU notifier by quiescing the context descriptor. The
context descriptor is only made invalid in unbind(), after the device
driver ensured the PASID is not used anymore. Releasing the PASID on mm
exit may cause it to be reallocated while it is still used by the
context descriptor.
There is another use-after-free, present since the beginning, where we
call arm64_mm_context_put() without a guarantee that mm_count is held.
Dave Hansen suggests to grab mm_users whenever binding the mm to a
device and drop it on unbinding. With that we can fix both issues and
simplify the driver by removing the release() notifier.
Fixes: 32784a9562fb ("iommu/arm-smmu-v3: Implement iommu_sva_bind/unbind()")
Reported-by: Zhangfei Gao <[email protected]>
Suggested-by: Dave Hansen <[email protected]>
Signed-off-by: Fenghua Yu <[email protected]>
Signed-off-by: Jean-Philippe Brucker <[email protected]>
---
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 -
.../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 49 +++++--------------
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 14 +-----
3 files changed, 15 insertions(+), 49 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index cd48590ada30..d50d215d946c 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -735,7 +735,6 @@ static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
extern struct xarray arm_smmu_asid_xa;
extern struct mutex arm_smmu_asid_lock;
-extern struct arm_smmu_ctx_desc quiet_cd;
int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid,
struct arm_smmu_ctx_desc *cd);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 22ddd05bbdcd..f9dff0f6cdd4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -6,6 +6,7 @@
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/mmu_notifier.h>
+#include <linux/sched/mm.h>
#include <linux/slab.h>
#include "arm-smmu-v3.h"
@@ -15,7 +16,6 @@
struct arm_smmu_mmu_notifier {
struct mmu_notifier mn;
struct arm_smmu_ctx_desc *cd;
- bool cleared;
refcount_t refs;
struct list_head list;
struct arm_smmu_domain *domain;
@@ -96,9 +96,14 @@ static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm)
struct arm_smmu_ctx_desc *cd;
struct arm_smmu_ctx_desc *ret = NULL;
+ /* Prevent mm exit as long as it is used by the context descriptor */
+ mmget(mm);
+
asid = arm64_mm_context_get(mm);
- if (!asid)
- return ERR_PTR(-ESRCH);
+ if (!asid) {
+ err = -ESRCH;
+ goto out_put_mm;
+ }
cd = kzalloc(sizeof(*cd), GFP_KERNEL);
if (!cd) {
@@ -165,6 +170,8 @@ static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm)
kfree(cd);
out_put_context:
arm64_mm_context_put(mm);
+out_put_mm:
+ mmput(mm);
return err < 0 ? ERR_PTR(err) : ret;
}
@@ -173,6 +180,7 @@ static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd)
if (arm_smmu_free_asid(cd)) {
/* Unpin ASID */
arm64_mm_context_put(cd->mm);
+ mmput(cd->mm);
kfree(cd);
}
}
@@ -191,30 +199,6 @@ static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn,
arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, start, size);
}
-static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
-{
- struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn);
- struct arm_smmu_domain *smmu_domain = smmu_mn->domain;
-
- mutex_lock(&sva_lock);
- if (smmu_mn->cleared) {
- mutex_unlock(&sva_lock);
- return;
- }
-
- /*
- * DMA may still be running. Keep the cd valid to avoid C_BAD_CD events,
- * but disable translation.
- */
- arm_smmu_write_ctx_desc(smmu_domain, mm->pasid, &quiet_cd);
-
- arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid);
- arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
-
- smmu_mn->cleared = true;
- mutex_unlock(&sva_lock);
-}
-
static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn)
{
kfree(mn_to_smmu(mn));
@@ -222,7 +206,6 @@ static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn)
static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = {
.invalidate_range = arm_smmu_mm_invalidate_range,
- .release = arm_smmu_mm_release,
.free_notifier = arm_smmu_mmu_notifier_free,
};
@@ -290,14 +273,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn)
list_del(&smmu_mn->list);
arm_smmu_write_ctx_desc(smmu_domain, mm->pasid, NULL);
- /*
- * If we went through clear(), we've already invalidated, and no
- * new TLB entry can have been formed.
- */
- if (!smmu_mn->cleared) {
- arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
- arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
- }
+ arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
+ arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
/* Frees smmu_mn */
mmu_notifier_put(&smmu_mn->mn);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 627a3ed5ee8f..246670318eda 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -76,12 +76,6 @@ struct arm_smmu_option_prop {
DEFINE_XARRAY_ALLOC1(arm_smmu_asid_xa);
DEFINE_MUTEX(arm_smmu_asid_lock);
-/*
- * Special value used by SVA when a process dies, to quiesce a CD without
- * disabling it.
- */
-struct arm_smmu_ctx_desc quiet_cd = { 0 };
-
static struct arm_smmu_option_prop arm_smmu_options[] = {
{ ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" },
{ ARM_SMMU_OPT_PAGE0_REGS_ONLY, "cavium,cn9900-broken-page1-regspace"},
@@ -1047,9 +1041,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid,
* (2) Install a secondary CD, for SID+SSID traffic.
* (3) Update ASID of a CD. Atomically write the first 64 bits of the
* CD, then invalidate the old entry and mappings.
- * (4) Quiesce the context without clearing the valid bit. Disable
- * translation, and ignore any translation fault.
- * (5) Remove a secondary CD.
+ * (4) Remove a secondary CD.
*/
u64 val;
bool cd_live;
@@ -1065,10 +1057,8 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid,
val = le64_to_cpu(cdptr[0]);
cd_live = !!(val & CTXDESC_CD_0_V);
- if (!cd) { /* (5) */
+ if (!cd) { /* (4) */
val = 0;
- } else if (cd == &quiet_cd) { /* (4) */
- val |= CTXDESC_CD_0_TCR_EPD0;
} else if (cd_live) { /* (3) */
val &= ~CTXDESC_CD_0_ASID;
val |= FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid);
--
2.35.1
Hi, Jean
On 2022/4/21 下午2:47, [email protected] wrote:
>
>
> On 2022/4/21 上午12:45, Jean-Philippe Brucker wrote:
>> Hi,
>>
>> On Fri, Apr 15, 2022 at 02:51:08AM -0700, Fenghua Yu wrote:
>>> From a6444e1e5bd8076f5e5c5e950d3192de327f0c9c Mon Sep 17 00:00:00 2001
>>> From: Fenghua Yu <[email protected]>
>>> Date: Fri, 15 Apr 2022 00:51:33 -0700
>>> Subject: [RFC PATCH] iommu/sva: Fix PASID use-after-free issue
>>>
>>> A PASID might be still used even though it is freed on mm exit.
>>>
>>> process A:
>>> sva_bind();
>>> ioasid_alloc() = N; // Get PASID N for the mm
>>> fork(): // spawn process B
>>> exit();
>>> ioasid_free(N);
>>>
>>> process B:
>>> device uses PASID N -> failure
>>> sva_unbind();
>>>
>>> Dave Hansen suggests to take a refcount on the mm whenever binding the
>>> PASID to a device and drop the refcount on unbinding. The mm won't be
>>> dropped if the PASID is still bound to it.
>>>
>>> Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID
>>> allocation and free it on mm exit")
>>>
>>> Reported-by: Zhangfei Gao <[email protected]>
>>> Suggested-by: Dave Hansen" <[email protected]>
>>> Signed-off-by: Fenghua Yu <[email protected]>
>>> ---
>>> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 6 ++++++
>>> drivers/iommu/intel/svm.c | 4 ++++
>>> 2 files changed, 10 insertions(+)
>>>
>>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>>> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>>> index 22ddd05bbdcd..3fcb842a0df0 100644
>>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>>> @@ -7,6 +7,7 @@
>>> #include <linux/mmu_context.h>
>>> #include <linux/mmu_notifier.h>
>>> #include <linux/slab.h>
>>> +#include <linux/sched/mm.h>
>>> #include "arm-smmu-v3.h"
>>> #include "../../iommu-sva-lib.h"
>>> @@ -363,6 +364,9 @@ arm_smmu_sva_bind(struct device *dev, struct
>>> mm_struct *mm, void *drvdata)
>>> mutex_lock(&sva_lock);
>>> handle = __arm_smmu_sva_bind(dev, mm);
>>> + /* Take an mm refcount on a successful bind. */
>>> + if (!IS_ERR(handle))
>>> + mmget(mm);
>>> mutex_unlock(&sva_lock);
>>> return handle;
>>> }
>>> @@ -372,6 +376,8 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle)
>>> struct arm_smmu_bond *bond = sva_to_bond(handle);
>>> mutex_lock(&sva_lock);
>>> + /* Drop an mm refcount. */
>>> + mmput(bond->mm);
>> I do like the idea because it will simplify the driver. We can't call
>> mmput() here, though, because it may call the release() MMU notifier
>> which
>> will try to grab sva_lock, already held.
>>
>> I also found another use-after-free in arm_smmu_free_shared_cd(),
>> where we
>> call arm64_mm_context_put() when the mm could already be freed. There
>> used
>> to be an mmgrab() preventing this but it went away during a rewrite.
>>
>> To fix both we could just move mmput() at the end of unbind() but I'd
>> rather do a proper cleanup removing the release() notifier right away.
>> Zhangfei, could you try the patch below?
>>
>> Thanks,
>> Jean
>>
>> --- 8< ---
>>
>> From 4e09c0d71dfb35fc90915bd1e36545027fbf8a03 Mon Sep 17 00:00:00 2001
>> From: Jean-Philippe Brucker <[email protected]>
>> Date: Wed, 20 Apr 2022 10:19:24 +0100
>> Subject: [PATCH] iommu/arm-smmu-v3-sva: Fix PASID and mm
>> use-after-free issues
>>
>> Commit 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID
>> allocation and free it on mm exit") frees the PASID earlier than what
>> the SMMUv3 driver expects. At the moment the SMMU driver handles mm exit
>> in the release() MMU notifier by quiescing the context descriptor. The
>> context descriptor is only made invalid in unbind(), after the device
>> driver ensured the PASID is not used anymore. Releasing the PASID on mm
>> exit may cause it to be reallocated while it is still used by the
>> context descriptor.
>>
>> There is another use-after-free, present since the beginning, where we
>> call arm64_mm_context_put() without a guarantee that mm_count is held.
>>
>> Dave Hansen suggests to grab mm_users whenever binding the mm to a
>> device and drop it on unbinding. With that we can fix both issues and
>> simplify the driver by removing the release() notifier.
>>
>> Fixes: 32784a9562fb ("iommu/arm-smmu-v3: Implement
>> iommu_sva_bind/unbind()")
>> Reported-by: Zhangfei Gao <[email protected]>
>> Suggested-by: Dave Hansen <[email protected]>
>> Signed-off-by: Fenghua Yu <[email protected]>
>> Signed-off-by: Jean-Philippe Brucker <[email protected]>
>> ---
>> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 -
>> .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 49 +++++--------------
>> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 14 +-----
>> 3 files changed, 15 insertions(+), 49 deletions(-)
>>
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
>> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
>> index cd48590ada30..d50d215d946c 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
>> @@ -735,7 +735,6 @@ static inline struct arm_smmu_domain
>> *to_smmu_domain(struct iommu_domain *dom)
>> extern struct xarray arm_smmu_asid_xa;
>> extern struct mutex arm_smmu_asid_lock;
>> -extern struct arm_smmu_ctx_desc quiet_cd;
>> int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain,
>> int ssid,
>> struct arm_smmu_ctx_desc *cd);
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>> index 22ddd05bbdcd..f9dff0f6cdd4 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>> @@ -6,6 +6,7 @@
>> #include <linux/mm.h>
>> #include <linux/mmu_context.h>
>> #include <linux/mmu_notifier.h>
>> +#include <linux/sched/mm.h>
>> #include <linux/slab.h>
>> #include "arm-smmu-v3.h"
>> @@ -15,7 +16,6 @@
>> struct arm_smmu_mmu_notifier {
>> struct mmu_notifier mn;
>> struct arm_smmu_ctx_desc *cd;
>> - bool cleared;
>> refcount_t refs;
>> struct list_head list;
>> struct arm_smmu_domain *domain;
>> @@ -96,9 +96,14 @@ static struct arm_smmu_ctx_desc
>> *arm_smmu_alloc_shared_cd(struct mm_struct *mm)
>> struct arm_smmu_ctx_desc *cd;
>> struct arm_smmu_ctx_desc *ret = NULL;
>> + /* Prevent mm exit as long as it is used by the context
>> descriptor */
>> + mmget(mm);
>> +
>> asid = arm64_mm_context_get(mm);
>> - if (!asid)
>> - return ERR_PTR(-ESRCH);
>> + if (!asid) {
>> + err = -ESRCH;
>> + goto out_put_mm;
>> + }
>> cd = kzalloc(sizeof(*cd), GFP_KERNEL);
>> if (!cd) {
>> @@ -165,6 +170,8 @@ static struct arm_smmu_ctx_desc
>> *arm_smmu_alloc_shared_cd(struct mm_struct *mm)
>> kfree(cd);
>> out_put_context:
>> arm64_mm_context_put(mm);
>> +out_put_mm:
>> + mmput(mm);
>> return err < 0 ? ERR_PTR(err) : ret;
>> }
>> @@ -173,6 +180,7 @@ static void arm_smmu_free_shared_cd(struct
>> arm_smmu_ctx_desc *cd)
>> if (arm_smmu_free_asid(cd)) {
>> /* Unpin ASID */
>> arm64_mm_context_put(cd->mm);
>> + mmput(cd->mm);
>> kfree(cd);
>> }
>> }
>> @@ -191,30 +199,6 @@ static void arm_smmu_mm_invalidate_range(struct
>> mmu_notifier *mn,
>> arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, start, size);
>> }
>> -static void arm_smmu_mm_release(struct mmu_notifier *mn, struct
>> mm_struct *mm)
>> -{
>> - struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn);
>> - struct arm_smmu_domain *smmu_domain = smmu_mn->domain;
>> -
>> - mutex_lock(&sva_lock);
>> - if (smmu_mn->cleared) {
>> - mutex_unlock(&sva_lock);
>> - return;
>> - }
>> -
>> - /*
>> - * DMA may still be running. Keep the cd valid to avoid C_BAD_CD
>> events,
>> - * but disable translation.
>> - */
>> - arm_smmu_write_ctx_desc(smmu_domain, mm->pasid, &quiet_cd);
>> -
>> - arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid);
>> - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
>> -
>> - smmu_mn->cleared = true;
>> - mutex_unlock(&sva_lock);
>> -}
>> -
>> static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn)
>> {
>> kfree(mn_to_smmu(mn));
>> @@ -222,7 +206,6 @@ static void arm_smmu_mmu_notifier_free(struct
>> mmu_notifier *mn)
>> static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = {
>> .invalidate_range = arm_smmu_mm_invalidate_range,
>> - .release = arm_smmu_mm_release,
>> .free_notifier = arm_smmu_mmu_notifier_free,
>> };
>> @@ -290,14 +273,8 @@ static void arm_smmu_mmu_notifier_put(struct
>> arm_smmu_mmu_notifier *smmu_mn)
>> list_del(&smmu_mn->list);
>> arm_smmu_write_ctx_desc(smmu_domain, mm->pasid, NULL);
>> - /*
>> - * If we went through clear(), we've already invalidated, and no
>> - * new TLB entry can have been formed.
>> - */
>> - if (!smmu_mn->cleared) {
>> - arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
>> - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
>> - }
>> + arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
>> + arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
>> /* Frees smmu_mn */
>> mmu_notifier_put(&smmu_mn->mn);
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> index 627a3ed5ee8f..246670318eda 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> @@ -76,12 +76,6 @@ struct arm_smmu_option_prop {
>> DEFINE_XARRAY_ALLOC1(arm_smmu_asid_xa);
>> DEFINE_MUTEX(arm_smmu_asid_lock);
>> -/*
>> - * Special value used by SVA when a process dies, to quiesce a CD
>> without
>> - * disabling it.
>> - */
>> -struct arm_smmu_ctx_desc quiet_cd = { 0 };
>> -
>> static struct arm_smmu_option_prop arm_smmu_options[] = {
>> { ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" },
>> { ARM_SMMU_OPT_PAGE0_REGS_ONLY,
>> "cavium,cn9900-broken-page1-regspace"},
>> @@ -1047,9 +1041,7 @@ int arm_smmu_write_ctx_desc(struct
>> arm_smmu_domain *smmu_domain, int ssid,
>> * (2) Install a secondary CD, for SID+SSID traffic.
>> * (3) Update ASID of a CD. Atomically write the first 64 bits
>> of the
>> * CD, then invalidate the old entry and mappings.
>> - * (4) Quiesce the context without clearing the valid bit. Disable
>> - * translation, and ignore any translation fault.
>> - * (5) Remove a secondary CD.
>> + * (4) Remove a secondary CD.
>> */
>> u64 val;
>> bool cd_live;
>> @@ -1065,10 +1057,8 @@ int arm_smmu_write_ctx_desc(struct
>> arm_smmu_domain *smmu_domain, int ssid,
>> val = le64_to_cpu(cdptr[0]);
>> cd_live = !!(val & CTXDESC_CD_0_V);
>> - if (!cd) { /* (5) */
>> + if (!cd) { /* (4) */
>> val = 0;
>> - } else if (cd == &quiet_cd) { /* (4) */
>> - val |= CTXDESC_CD_0_TCR_EPD0;
>> } else if (cd_live) { /* (3) */
>> val &= ~CTXDESC_CD_0_ASID;
>> val |= FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid);
> Thanks Jean
>
> Have tested, still got some issue with our openssl-engine.
>
> 1. If openssl-engine does not register rsa, nginx works well.
>
> 2. If openssl-engine register rsa, nginx also works, but ioasid is not
> freed when nginx stop.
>
> IMPLEMENT_DYNAMIC_BIND_FN(bind_fn)
> bind_fn
> ENGINE_set_RSA(e, rsa_methods())
>
> destroy_fn
>
> If ENGINE_set_RSA is set, nginx start and stop will NOT call destroy_fn.
> Even rsa_methods is almost new via RSA_meth_new.
>
> In 5.18-rcx, this caused ioasid not freed in nginx start and stop.
> In 5.17, though destroy_fn is not called, but ioasid is freed when
> nginx stop, so not noticed this issue before.
1. uacce_fops_release
In 5.16 or 5.17
In fact, we aslo has the issue: openssl engine does not call destroy_fn
-> close(uacce_fd)
But system will automatically close all opened fd,
so uacce_fops_release is also called and free ioasid.
Have one experiment, not call close fd
log: open uacce fd but no close
[ 2583.471225] dump_backtrace+0x0/0x1a0
[ 2583.474876] show_stack+0x20/0x30
[ 2583.478178] dump_stack_lvl+0x8c/0xb8
[ 2583.481825] dump_stack+0x18/0x34
[ 2583.485126] uacce_fops_release+0x44/0xdc
[ 2583.489117] __fput+0x78/0x240
[ 2583.492159] ____fput+0x18/0x28
[ 2583.495288] task_work_run+0x88/0x160
[ 2583.498936] do_notify_resume+0x214/0x490
[ 2583.502927] el0_svc+0x58/0x70
[ 2583.505968] el0t_64_sync_handler+0xb0/0xb8
[ 2583.510132] el0t_64_sync+0x1a0/0x1a4
[ 2583.582292] uacce_fops_release q=00000000d6674128
In 5.18, since refcount was add.
The opened uacce fd was not closed automatically by system
So we see the issue.
log: open uacce fd but no close
[ 106.360140] uacce_fops_open q=00000000ccc38d74
[ 106.364929] ioasid_alloc ioasid=1
[ 106.368585] iommu_sva_alloc_pasid pasid=1
[ 106.372943] iommu_sva_bind_device handle=000000006cca298a
// ioasid is not free
2. About the openssl engine destroy_fn
Only if we register rsa, even empry, the destroy_fn will not be called.
The reason is engine refcount is not 0, so not call e->destroy(e).
Not sure it is openssl issue or nginx issue.
IMPLEMENT_DYNAMIC_BIND_FN(bind_fn)
bind_fn
ENGINE_set_destroy_function(e, destroy_fn);
ENGINE_set_RSA(e, RSA_meth_new("rsa method", 0)); // here just add
empty RSA
In openssl
engine_free_util
CRYPTO_DOWN_REF(&e->struct_ref, &i, global_engine_lock);
if (i > 0)
return 1;
if (e->destroy)
e->destroy(e);
If we register RSA, ENGINE_get_default_RSA will be called twice, adding
e->struct_ref + 2, so at last e->destroy(e) will not be called.
But the refcount is not decreased accordingly.
Basic calling sequence:
nginx:
src/http/modules/ngx_http_ssl_module.c
ngx_ssl_certificates
ngx_ssl_certificate
ngx_ssl_load_certificate-> PEM_ASN1_read_bio-> rsa_cb-> RSA_new
ngx_ssl_load_certificate_key-> PEM_read_bio_PrivateKey-> rsa_cb-> RSA_new
RSA_new ->ENGINE_get_default_RSA
more detail log:
First time call RSA_new, refcount +1
ngx_ssl_load_certificate-> PEM_ASN1_read_bio-> rsa_cb-> RSA_new
#0 0x0000fffff7b8cae0 in RSA_new () from /usr/local/lib/libcrypto.so.1.1
#1 0x0000fffff7b8b0bc in rsa_cb () from /usr/local/lib/libcrypto.so.1.1
#2 0x0000fffff7a952d4 in ASN1_item_ex_new () from
/usr/local/lib/libcrypto.so.1.1
#3 0x0000fffff7a9300c in ASN1_item_ex_d2i () from
/usr/local/lib/libcrypto.so.1.1
#4 0x0000fffff7a93474 in ASN1_item_d2i () from
/usr/local/lib/libcrypto.so.1.1
#5 0x0000fffff7b89654 in rsa_pub_decode () from
/usr/local/lib/libcrypto.so.1.1
#6 0x0000fffff7bc5c08 in x509_pubkey_decode () from
/usr/local/lib/libcrypto.so.1.1
#7 0x0000fffff7bc5d34 in pubkey_cb () from /usr/local/lib/libcrypto.so.1.1
#8 0x0000fffff7a921d4 in asn1_item_embed_d2i () from
/usr/local/lib/libcrypto.so.1.1
#9 0x0000fffff7a92674 in asn1_template_noexp_d2i () from
/usr/local/lib/libcrypto.so.1.1
#10 0x0000fffff7a92904 in asn1_template_ex_d2i () from
/usr/local/lib/libcrypto.so.1.1
#11 0x0000fffff7a92038 in asn1_item_embed_d2i () from
/usr/local/lib/libcrypto.so.1.1
#12 0x0000fffff7a92674 in asn1_template_noexp_d2i () from
/usr/local/lib/libcrypto.so.1.1
#13 0x0000fffff7a92904 in asn1_template_ex_d2i () from
/usr/local/lib/libcrypto.so.1.1
#14 0x0000fffff7a9306c in ASN1_item_ex_d2i () from
/usr/local/lib/libcrypto.so.1.1
#15 0x0000fffff7a93474 in ASN1_item_d2i () from
/usr/local/lib/libcrypto.so.1.1
#16 0x0000fffff7bc6860 in d2i_X509_AUX () from
/usr/local/lib/libcrypto.so.1.1
#17 0x0000fffff7b72adc in PEM_ASN1_read_bio () from
/usr/local/lib/libcrypto.so.1.1
#18 0x0000aaaaaaaf5454 in ngx_ssl_load_certificate (pool=<optimized
out>, err=err@entry=0xffffffffe9e8,
cert=cert@entry=0xaaaaaac0ba30, chain=chain@entry=0xffffffffe9f0)
at src/event/ngx_event_openssl.c:648
#19 0x0000aaaaaaaf7330 in ngx_ssl_certificate
(cf=cf@entry=0xffffffffef58, ssl=ssl@entry=0xaaaaaac0a6b8,
cert=cert@entry=0xaaaaaac0ba30, key=key@entry=0xaaaaaac0bad0,
passwords=passwords@entry=0x0)
at src/event/ngx_event_openssl.c:430
#20 0x0000aaaaaaaf7658 in ngx_ssl_certificates
(cf=cf@entry=0xffffffffef58, ssl=ssl@entry=0xaaaaaac0a6b8,
certs=0xaaaaaac0ba00, keys=<optimized out>, passwords=0x0) at
src/event/ngx_event_openssl.c:410
#21 0x0000aaaaaab498f0 in ngx_http_ssl_merge_srv_conf
(cf=0xffffffffef58, parent=0xaaaaaabf2a00,
child=0xaaaaaac0a6b0) at src/http/modules/ngx_http_ssl_module.c:787
#22 0x0000aaaaaab01c40 in ngx_http_merge_servers (ctx_index=17,
module=0xaaaaaaba86b0 <ngx_http_ssl_module_ctx>,
cmcf=0xaaaaaabf1ec0, cf=0xffffffffef58) at src/http/ngx_http.c:584
#23 ngx_http_block (cf=0xffffffffef58, cmd=<optimized out>,
conf=<optimized out>) at src/http/ngx_http.c:270
#24 0x0000aaaaaaad8d78 in ngx_conf_handler (last=1, cf=0xffffffffef58)
at src/core/ngx_conf_file.c:463
#25 ngx_conf_parse (cf=cf@entry=0xffffffffef58,
filename=filename@entry=0xaaaaaabeef58)
at src/core/ngx_conf_file.c:319
#26 0x0000aaaaaaad628c in ngx_init_cycle (old_cycle=0xfffffffff170) at
src/core/ngx_cycle.c:284
#27 0x0000aaaaaaac1b80 in main (argc=<optimized out>, argv=<optimized
out>) at src/core/nginx.c:292
Second time call RSA_new: refcount +1
ngx_ssl_load_certificate_key-> PEM_read_bio_PrivateKey-> rsa_cb-> RSA_new
#0 0x0000fffff7b8cae0 in RSA_new () from /usr/local/lib/libcrypto.so.1.1
#1 0x0000fffff7b8b0bc in rsa_cb () from /usr/local/lib/libcrypto.so.1.1
#2 0x0000fffff7a952d4 in ASN1_item_ex_new () from
/usr/local/lib/libcrypto.so.1.1
#3 0x0000fffff7a9300c in ASN1_item_ex_d2i () from
/usr/local/lib/libcrypto.so.1.1
#4 0x0000fffff7a93474 in ASN1_item_d2i () from
/usr/local/lib/libcrypto.so.1.1
#5 0x0000fffff7b89cbc in rsa_priv_decode () from
/usr/local/lib/libcrypto.so.1.1
#6 0x0000fffff7b4d1ac in EVP_PKCS82PKEY () from
/usr/local/lib/libcrypto.so.1.1
#7 0x0000fffff7b73730 in PEM_read_bio_PrivateKey () from
/usr/local/lib/libcrypto.so.1.1
#8 0x0000aaaaaaaf376c in ngx_ssl_load_certificate_key (pool=<optimized
out>, err=err@entry=0xffffffffe9d8,
key=key@entry=0xaaaaaac08ad0, passwords=passwords@entry=0x0) at
src/event/ngx_event_openssl.c:800
#9 0x0000aaaaaaaf5430 in ngx_ssl_certificate
(cf=cf@entry=0xffffffffef58, ssl=ssl@entry=0xaaaaaac076b8,
cert=cert@entry=0xaaaaaac08a30, key=key@entry=0xaaaaaac08ad0,
passwords=passwords@entry=0x0)
at src/event/ngx_event_openssl.c:521
#10 0x0000aaaaaaaf56e8 in ngx_ssl_certificates
(cf=cf@entry=0xffffffffef58, ssl=ssl@entry=0xaaaaaac076b8,
certs=0xaaaaaac08a00, keys=<optimized out>, passwords=0x0) at
src/event/ngx_event_openssl.c:413
#11 0x0000aaaaaab46980 in ngx_http_ssl_merge_srv_conf
(cf=0xffffffffef58, parent=0xaaaaaabefa00,
child=0xaaaaaac076b0) at src/http/modules/ngx_http_ssl_module.c:787
#12 0x0000aaaaaaaffcd0 in ngx_http_merge_servers (ctx_index=17,
module=0xaaaaaaba56b0 <ngx_http_ssl_module_ctx>,
cmcf=0xaaaaaabeeec0, cf=0xffffffffef58) at src/http/ngx_http.c:584
#13 ngx_http_block (cf=0xffffffffef58, cmd=<optimized out>,
conf=<optimized out>) at src/http/ngx_http.c:270
#14 0x0000aaaaaaad8dc8 in ngx_conf_handler (last=1, cf=0xffffffffef58)
at src/core/ngx_conf_file.c:463
#15 ngx_conf_parse (cf=cf@entry=0xffffffffef58,
filename=filename@entry=0xaaaaaabebf58)
at src/core/ngx_conf_file.c:319
#16 0x0000aaaaaaad62dc in ngx_init_cycle (old_cycle=0xfffffffff170) at
src/core/ngx_cycle.c:284
#17 0x0000aaaaaaac1bd0 in main (argc=<optimized out>, argv=<optimized
out>) at src/core/nginx.c:292
Thanks
On 2022/4/21 上午12:45, Jean-Philippe Brucker wrote:
> Hi,
>
> On Fri, Apr 15, 2022 at 02:51:08AM -0700, Fenghua Yu wrote:
>> From a6444e1e5bd8076f5e5c5e950d3192de327f0c9c Mon Sep 17 00:00:00 2001
>> From: Fenghua Yu <[email protected]>
>> Date: Fri, 15 Apr 2022 00:51:33 -0700
>> Subject: [RFC PATCH] iommu/sva: Fix PASID use-after-free issue
>>
>> A PASID might be still used even though it is freed on mm exit.
>>
>> process A:
>> sva_bind();
>> ioasid_alloc() = N; // Get PASID N for the mm
>> fork(): // spawn process B
>> exit();
>> ioasid_free(N);
>>
>> process B:
>> device uses PASID N -> failure
>> sva_unbind();
>>
>> Dave Hansen suggests to take a refcount on the mm whenever binding the
>> PASID to a device and drop the refcount on unbinding. The mm won't be
>> dropped if the PASID is still bound to it.
>>
>> Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit")
>>
>> Reported-by: Zhangfei Gao <[email protected]>
>> Suggested-by: Dave Hansen" <[email protected]>
>> Signed-off-by: Fenghua Yu <[email protected]>
>> ---
>> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 6 ++++++
>> drivers/iommu/intel/svm.c | 4 ++++
>> 2 files changed, 10 insertions(+)
>>
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>> index 22ddd05bbdcd..3fcb842a0df0 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>> @@ -7,6 +7,7 @@
>> #include <linux/mmu_context.h>
>> #include <linux/mmu_notifier.h>
>> #include <linux/slab.h>
>> +#include <linux/sched/mm.h>
>>
>> #include "arm-smmu-v3.h"
>> #include "../../iommu-sva-lib.h"
>> @@ -363,6 +364,9 @@ arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
>>
>> mutex_lock(&sva_lock);
>> handle = __arm_smmu_sva_bind(dev, mm);
>> + /* Take an mm refcount on a successful bind. */
>> + if (!IS_ERR(handle))
>> + mmget(mm);
>> mutex_unlock(&sva_lock);
>> return handle;
>> }
>> @@ -372,6 +376,8 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle)
>> struct arm_smmu_bond *bond = sva_to_bond(handle);
>>
>> mutex_lock(&sva_lock);
>> + /* Drop an mm refcount. */
>> + mmput(bond->mm);
> I do like the idea because it will simplify the driver. We can't call
> mmput() here, though, because it may call the release() MMU notifier which
> will try to grab sva_lock, already held.
>
> I also found another use-after-free in arm_smmu_free_shared_cd(), where we
> call arm64_mm_context_put() when the mm could already be freed. There used
> to be an mmgrab() preventing this but it went away during a rewrite.
>
> To fix both we could just move mmput() at the end of unbind() but I'd
> rather do a proper cleanup removing the release() notifier right away.
> Zhangfei, could you try the patch below?
>
> Thanks,
> Jean
>
> --- 8< ---
>
> From 4e09c0d71dfb35fc90915bd1e36545027fbf8a03 Mon Sep 17 00:00:00 2001
> From: Jean-Philippe Brucker <[email protected]>
> Date: Wed, 20 Apr 2022 10:19:24 +0100
> Subject: [PATCH] iommu/arm-smmu-v3-sva: Fix PASID and mm use-after-free issues
>
> Commit 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID
> allocation and free it on mm exit") frees the PASID earlier than what
> the SMMUv3 driver expects. At the moment the SMMU driver handles mm exit
> in the release() MMU notifier by quiescing the context descriptor. The
> context descriptor is only made invalid in unbind(), after the device
> driver ensured the PASID is not used anymore. Releasing the PASID on mm
> exit may cause it to be reallocated while it is still used by the
> context descriptor.
>
> There is another use-after-free, present since the beginning, where we
> call arm64_mm_context_put() without a guarantee that mm_count is held.
>
> Dave Hansen suggests to grab mm_users whenever binding the mm to a
> device and drop it on unbinding. With that we can fix both issues and
> simplify the driver by removing the release() notifier.
>
> Fixes: 32784a9562fb ("iommu/arm-smmu-v3: Implement iommu_sva_bind/unbind()")
> Reported-by: Zhangfei Gao <[email protected]>
> Suggested-by: Dave Hansen <[email protected]>
> Signed-off-by: Fenghua Yu <[email protected]>
> Signed-off-by: Jean-Philippe Brucker <[email protected]>
> ---
> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 -
> .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 49 +++++--------------
> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 14 +-----
> 3 files changed, 15 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> index cd48590ada30..d50d215d946c 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> @@ -735,7 +735,6 @@ static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
>
> extern struct xarray arm_smmu_asid_xa;
> extern struct mutex arm_smmu_asid_lock;
> -extern struct arm_smmu_ctx_desc quiet_cd;
>
> int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid,
> struct arm_smmu_ctx_desc *cd);
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> index 22ddd05bbdcd..f9dff0f6cdd4 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> @@ -6,6 +6,7 @@
> #include <linux/mm.h>
> #include <linux/mmu_context.h>
> #include <linux/mmu_notifier.h>
> +#include <linux/sched/mm.h>
> #include <linux/slab.h>
>
> #include "arm-smmu-v3.h"
> @@ -15,7 +16,6 @@
> struct arm_smmu_mmu_notifier {
> struct mmu_notifier mn;
> struct arm_smmu_ctx_desc *cd;
> - bool cleared;
> refcount_t refs;
> struct list_head list;
> struct arm_smmu_domain *domain;
> @@ -96,9 +96,14 @@ static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm)
> struct arm_smmu_ctx_desc *cd;
> struct arm_smmu_ctx_desc *ret = NULL;
>
> + /* Prevent mm exit as long as it is used by the context descriptor */
> + mmget(mm);
> +
> asid = arm64_mm_context_get(mm);
> - if (!asid)
> - return ERR_PTR(-ESRCH);
> + if (!asid) {
> + err = -ESRCH;
> + goto out_put_mm;
> + }
>
> cd = kzalloc(sizeof(*cd), GFP_KERNEL);
> if (!cd) {
> @@ -165,6 +170,8 @@ static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm)
> kfree(cd);
> out_put_context:
> arm64_mm_context_put(mm);
> +out_put_mm:
> + mmput(mm);
> return err < 0 ? ERR_PTR(err) : ret;
> }
>
> @@ -173,6 +180,7 @@ static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd)
> if (arm_smmu_free_asid(cd)) {
> /* Unpin ASID */
> arm64_mm_context_put(cd->mm);
> + mmput(cd->mm);
> kfree(cd);
> }
> }
> @@ -191,30 +199,6 @@ static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn,
> arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, start, size);
> }
>
> -static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> -{
> - struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn);
> - struct arm_smmu_domain *smmu_domain = smmu_mn->domain;
> -
> - mutex_lock(&sva_lock);
> - if (smmu_mn->cleared) {
> - mutex_unlock(&sva_lock);
> - return;
> - }
> -
> - /*
> - * DMA may still be running. Keep the cd valid to avoid C_BAD_CD events,
> - * but disable translation.
> - */
> - arm_smmu_write_ctx_desc(smmu_domain, mm->pasid, &quiet_cd);
> -
> - arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid);
> - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
> -
> - smmu_mn->cleared = true;
> - mutex_unlock(&sva_lock);
> -}
> -
> static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn)
> {
> kfree(mn_to_smmu(mn));
> @@ -222,7 +206,6 @@ static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn)
>
> static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = {
> .invalidate_range = arm_smmu_mm_invalidate_range,
> - .release = arm_smmu_mm_release,
> .free_notifier = arm_smmu_mmu_notifier_free,
> };
>
> @@ -290,14 +273,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn)
> list_del(&smmu_mn->list);
> arm_smmu_write_ctx_desc(smmu_domain, mm->pasid, NULL);
>
> - /*
> - * If we went through clear(), we've already invalidated, and no
> - * new TLB entry can have been formed.
> - */
> - if (!smmu_mn->cleared) {
> - arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
> - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
> - }
> + arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
> + arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
>
> /* Frees smmu_mn */
> mmu_notifier_put(&smmu_mn->mn);
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 627a3ed5ee8f..246670318eda 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -76,12 +76,6 @@ struct arm_smmu_option_prop {
> DEFINE_XARRAY_ALLOC1(arm_smmu_asid_xa);
> DEFINE_MUTEX(arm_smmu_asid_lock);
>
> -/*
> - * Special value used by SVA when a process dies, to quiesce a CD without
> - * disabling it.
> - */
> -struct arm_smmu_ctx_desc quiet_cd = { 0 };
> -
> static struct arm_smmu_option_prop arm_smmu_options[] = {
> { ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" },
> { ARM_SMMU_OPT_PAGE0_REGS_ONLY, "cavium,cn9900-broken-page1-regspace"},
> @@ -1047,9 +1041,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid,
> * (2) Install a secondary CD, for SID+SSID traffic.
> * (3) Update ASID of a CD. Atomically write the first 64 bits of the
> * CD, then invalidate the old entry and mappings.
> - * (4) Quiesce the context without clearing the valid bit. Disable
> - * translation, and ignore any translation fault.
> - * (5) Remove a secondary CD.
> + * (4) Remove a secondary CD.
> */
> u64 val;
> bool cd_live;
> @@ -1065,10 +1057,8 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain, int ssid,
> val = le64_to_cpu(cdptr[0]);
> cd_live = !!(val & CTXDESC_CD_0_V);
>
> - if (!cd) { /* (5) */
> + if (!cd) { /* (4) */
> val = 0;
> - } else if (cd == &quiet_cd) { /* (4) */
> - val |= CTXDESC_CD_0_TCR_EPD0;
> } else if (cd_live) { /* (3) */
> val &= ~CTXDESC_CD_0_ASID;
> val |= FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid);
Thanks Jean
Have tested, still got some issue with our openssl-engine.
1. If openssl-engine does not register rsa, nginx works well.
2. If openssl-engine register rsa, nginx also works, but ioasid is not
freed when nginx stop.
IMPLEMENT_DYNAMIC_BIND_FN(bind_fn)
bind_fn
ENGINE_set_RSA(e, rsa_methods())
destroy_fn
If ENGINE_set_RSA is set, nginx start and stop will NOT call destroy_fn.
Even rsa_methods is almost new via RSA_meth_new.
In 5.18-rcx, this caused ioasid not freed in nginx start and stop.
In 5.17, though destroy_fn is not called, but ioasid is freed when nginx
stop, so not noticed this issue before.
Still in checking why ENGINE_set_RSA blocks destroy_fn.
Thanks
On Fri, Apr 22, 2022 at 09:15:01PM +0800, [email protected] wrote:
> > I'm trying to piece together what happens from the kernel point of view.
> >
> > * master process with mm A opens a queue fd through uacce, which calls
> > iommu_sva_bind_device(dev, A) -> PASID 1
> >
> > * master forks and exits. Child (daemon) gets mm B, inherits the queue fd.
> > The device is still bound to mm A with PASID 1, since the queue fd is
> > still open.
>
> > We discussed this before, but I don't remember where we left off. The
> > child can't use the queue because its mappings are not copied on fork(),
> > and the queue is still bound to the parent mm A. The child either needs to
> > open a new queue or take ownership of the old one with a new uacce ioctl.
> Yes, currently nginx aligned with the case.
> Child process (worker process) reopen uacce,
>
> Master process (do init) open uacce, iommu_sva_bind_device(dev, A) -> PASID
> 1
> Master process fork Child (daemon) and exit.
>
> Child (daemon) does not use PASID 1 any more, only fork and manage worker
> process.
> Worker process reopen uacce, iommu_sva_bind_device(dev, B) PASID 2
>
> So it is expected.
Yes, that's fine
> > Is that the "IMPLEMENT_DYNAMIC_BIND_FN()" you mention, something out of
> > tree? This operation should unbind from A before binding to B, no?
> > Otherwise we leak PASID 1.
> In 5.16 PASID 1 from master is hold until nginx service stop.
> nginx start
> master:
> iommu_sva_alloc_pasid mm->pasid=1 // master process
>
> lynx https start:
> iommu_sva_alloc_pasid mm->pasid=2 //worker process
>
> nginx stop: from fops_release
> iommu_sva_free_pasid mm->pasid=2 // worker process
> iommu_sva_free_pasid mm->pasid=1 // master process
That's the expected behavior (master could close its fd before forking, in
order to free things up earlier, but it's not mandatory)
> Have one silly question.
>
> kerne driver
> fops_open
> iommu_sva_bind_device
>
> fops_release
> iommu_sva_unbind_device
>
> application
> main()
> fd = open
> return;
>
> Application exit but not close(fd), is it expected fops_release will be
> called automatically by system?
Yes, the application doesn't have to call close() explicitly, the file
descriptor is closed automatically on exit. Note that the fd is copied on
fork(), so it is only released once parent and all child processes exit.
> On 5.17
> fops_release is called automatically, as well as iommu_sva_unbind_device.
> On 5.18-rc1.
> fops_release is not called, have to manually call close(fd)
Right that's weird
> Since nginx may have a issue, it does not call close(fd) when nginx -s quit.
And you're sure that none of the processes are still alive or in zombie
state? Just to cover every possibility.
Thanks,
Jean
Hi, Jean
On 2022/4/22 下午11:50, Jean-Philippe Brucker wrote:
> On Fri, Apr 22, 2022 at 09:15:01PM +0800, [email protected] wrote:
>>> I'm trying to piece together what happens from the kernel point of view.
>>>
>>> * master process with mm A opens a queue fd through uacce, which calls
>>> iommu_sva_bind_device(dev, A) -> PASID 1
>>>
>>> * master forks and exits. Child (daemon) gets mm B, inherits the queue fd.
>>> The device is still bound to mm A with PASID 1, since the queue fd is
>>> still open.
>>> We discussed this before, but I don't remember where we left off. The
>>> child can't use the queue because its mappings are not copied on fork(),
>>> and the queue is still bound to the parent mm A. The child either needs to
>>> open a new queue or take ownership of the old one with a new uacce ioctl.
>> Yes, currently nginx aligned with the case.
>> Child process (worker process) reopen uacce,
>>
>> Master process (do init) open uacce, iommu_sva_bind_device(dev, A) -> PASID
>> 1
>> Master process fork Child (daemon) and exit.
>>
>> Child (daemon) does not use PASID 1 any more, only fork and manage worker
>> process.
>> Worker process reopen uacce, iommu_sva_bind_device(dev, B) PASID 2
>>
>> So it is expected.
> Yes, that's fine
>
>>> Is that the "IMPLEMENT_DYNAMIC_BIND_FN()" you mention, something out of
>>> tree? This operation should unbind from A before binding to B, no?
>>> Otherwise we leak PASID 1.
>> In 5.16 PASID 1 from master is hold until nginx service stop.
>> nginx start
>> master:
>> iommu_sva_alloc_pasid mm->pasid=1 // master process
>>
>> lynx https start:
>> iommu_sva_alloc_pasid mm->pasid=2 //worker process
>>
>> nginx stop: from fops_release
>> iommu_sva_free_pasid mm->pasid=2 // worker process
>> iommu_sva_free_pasid mm->pasid=1 // master process
> That's the expected behavior (master could close its fd before forking, in
> order to free things up earlier, but it's not mandatory)
Currently we unbind in fops_release, so the ioasid allocated in master
can only be freed when nginx stop,
when all forked fd are closed.
>
>> Have one silly question.
>>
>> kerne driver
>> fops_open
>> iommu_sva_bind_device
>>
>> fops_release
>> iommu_sva_unbind_device
>>
>> application
>> main()
>> fd = open
>> return;
>>
>> Application exit but not close(fd), is it expected fops_release will be
>> called automatically by system?
> Yes, the application doesn't have to call close() explicitly, the file
> descriptor is closed automatically on exit. Note that the fd is copied on
> fork(), so it is only released once parent and all child processes exit.
Yes, in case the application ended unexpected, like ctrl+c
>
>> On 5.17
>> fops_release is called automatically, as well as iommu_sva_unbind_device.
>> On 5.18-rc1.
>> fops_release is not called, have to manually call close(fd)
> Right that's weird
Looks it is caused by the fix patch, via mmget, which may add refcount
of fd.
Some experiments
1. 5.17, everything works well.
2. 5.17 + patchset of "iommu/sva: Assign a PASID to mm on PASID
allocation and free it on mm exit"
Test application, exit without close uacce fd
First time: fops_release can be called automatically.
log:
ioasid_alloc ioasid=1
iommu_sva_alloc_pasid pasid=1
iommu_sva_bind_device handle=00000000263a2ee8
ioasid_free ioasid=1
uacce_fops_release q=0000000055ca3cdf
iommu_sva_unbind_device handle=00000000263a2ee8
Second time: hardware reports error
uacce_fops_open q=000000008e4d6f78
ioasid_alloc ioasid=1
iommu_sva_alloc_pasid pasid=1
iommu_sva_bind_device handle=00000000cfd11788
// Haredware reports error
hisi_sec2 0000:b6:00.0: qm_acc_do_task_timeout [error status=0x20] found
hisi_sec2 0000:b6:00.0: qm_acc_wb_not_ready_timeout [error status=0x40]
found
hisi_sec2 0000:b6:00.0: sec_fsm_hbeat_rint [error status=0x20] found
hisi_sec2 0000:b6:00.0: Controller resetting...
hisi_sec2 0000:b6:00.0: QM mailbox operation timeout!
hisi_sec2 0000:b6:00.0: Failed to dump sqc!
hisi_sec2 0000:b6:00.0: Failed to drain out data for stopping!
hisi_sec2 0000:b6:00.0: Bus lock! Please reset system.
hisi_sec2 0000:b6:00.0: Controller reset failed (-110)
hisi_sec2 0000:b6:00.0: controller reset failed (-110)
3. Add the fix patch of using mmget in bind.
Test application, exit without close uacce fd
fops_release can NOT be called automatically, looks mmget adds refcount
of fd.
So the fix method of using mmget blocks fops_release to be called once
fd is closed without unbind.
>> Since nginx may have a issue, it does not call close(fd) when nginx -s quit.
> And you're sure that none of the processes are still alive or in zombie
> state? Just to cover every possibility.
It can also reproduced by a simple application exit without close(uacce_fd)
Thanks
On Sat, 23 Apr 2022 at 19:13, [email protected]
<[email protected]> wrote:
>
> Hi, Jean
>
> On 2022/4/22 下午11:50, Jean-Philippe Brucker wrote:
> > On Fri, Apr 22, 2022 at 09:15:01PM +0800, [email protected] wrote:
> >>> I'm trying to piece together what happens from the kernel point of view.
> >>>
> >>> * master process with mm A opens a queue fd through uacce, which calls
> >>> iommu_sva_bind_device(dev, A) -> PASID 1
> >>>
> >>> * master forks and exits. Child (daemon) gets mm B, inherits the queue fd.
> >>> The device is still bound to mm A with PASID 1, since the queue fd is
> >>> still open.
> >>> We discussed this before, but I don't remember where we left off. The
> >>> child can't use the queue because its mappings are not copied on fork(),
> >>> and the queue is still bound to the parent mm A. The child either needs to
> >>> open a new queue or take ownership of the old one with a new uacce ioctl.
> >> Yes, currently nginx aligned with the case.
> >> Child process (worker process) reopen uacce,
> >>
> >> Master process (do init) open uacce, iommu_sva_bind_device(dev, A) -> PASID
> >> 1
> >> Master process fork Child (daemon) and exit.
> >>
> >> Child (daemon) does not use PASID 1 any more, only fork and manage worker
> >> process.
> >> Worker process reopen uacce, iommu_sva_bind_device(dev, B) PASID 2
> >>
> >> So it is expected.
> > Yes, that's fine
> >
> >>> Is that the "IMPLEMENT_DYNAMIC_BIND_FN()" you mention, something out of
> >>> tree? This operation should unbind from A before binding to B, no?
> >>> Otherwise we leak PASID 1.
> >> In 5.16 PASID 1 from master is hold until nginx service stop.
> >> nginx start
> >> master:
> >> iommu_sva_alloc_pasid mm->pasid=1 // master process
> >>
> >> lynx https start:
> >> iommu_sva_alloc_pasid mm->pasid=2 //worker process
> >>
> >> nginx stop: from fops_release
> >> iommu_sva_free_pasid mm->pasid=2 // worker process
> >> iommu_sva_free_pasid mm->pasid=1 // master process
> > That's the expected behavior (master could close its fd before forking, in
> > order to free things up earlier, but it's not mandatory)
> Currently we unbind in fops_release, so the ioasid allocated in master
> can only be freed when nginx stop,
> when all forked fd are closed.
>
> >
> >> Have one silly question.
> >>
> >> kerne driver
> >> fops_open
> >> iommu_sva_bind_device
> >>
> >> fops_release
> >> iommu_sva_unbind_device
> >>
> >> application
> >> main()
> >> fd = open
> >> return;
> >>
> >> Application exit but not close(fd), is it expected fops_release will be
> >> called automatically by system?
> > Yes, the application doesn't have to call close() explicitly, the file
> > descriptor is closed automatically on exit. Note that the fd is copied on
> > fork(), so it is only released once parent and all child processes exit.
> Yes, in case the application ended unexpected, like ctrl+c
> >
> >> On 5.17
> >> fops_release is called automatically, as well as iommu_sva_unbind_device.
> >> On 5.18-rc1.
> >> fops_release is not called, have to manually call close(fd)
> > Right that's weird
> Looks it is caused by the fix patch, via mmget, which may add refcount
> of fd.
>
> Some experiments
> 1. 5.17, everything works well.
>
> 2. 5.17 + patchset of "iommu/sva: Assign a PASID to mm on PASID
> allocation and free it on mm exit"
>
> Test application, exit without close uacce fd
> First time: fops_release can be called automatically.
>
> log:
> ioasid_alloc ioasid=1
> iommu_sva_alloc_pasid pasid=1
> iommu_sva_bind_device handle=00000000263a2ee8
> ioasid_free ioasid=1
> uacce_fops_release q=0000000055ca3cdf
> iommu_sva_unbind_device handle=00000000263a2ee8
>
> Second time: hardware reports error
>
> uacce_fops_open q=000000008e4d6f78
> ioasid_alloc ioasid=1
> iommu_sva_alloc_pasid pasid=1
> iommu_sva_bind_device handle=00000000cfd11788
> // Haredware reports error
> hisi_sec2 0000:b6:00.0: qm_acc_do_task_timeout [error status=0x20] found
> hisi_sec2 0000:b6:00.0: qm_acc_wb_not_ready_timeout [error status=0x40]
> found
> hisi_sec2 0000:b6:00.0: sec_fsm_hbeat_rint [error status=0x20] found
> hisi_sec2 0000:b6:00.0: Controller resetting...
> hisi_sec2 0000:b6:00.0: QM mailbox operation timeout!
> hisi_sec2 0000:b6:00.0: Failed to dump sqc!
> hisi_sec2 0000:b6:00.0: Failed to drain out data for stopping!
> hisi_sec2 0000:b6:00.0: Bus lock! Please reset system.
> hisi_sec2 0000:b6:00.0: Controller reset failed (-110)
> hisi_sec2 0000:b6:00.0: controller reset failed (-110)
>
> 3. Add the fix patch of using mmget in bind.
> Test application, exit without close uacce fd
>
> fops_release can NOT be called automatically, looks mmget adds refcount
> of fd.
Test application, exit without closing fd.
> >> kernel driver
> >> fops_open
> >> iommu_sva_bind_device
> >>
> >> fops_release
> >> iommu_sva_unbind_device
1.
5.17 kernel, no mmget & mmput
wd_release_queue no close
Compress bz=512000 nb=1×10, speed=139.5 MB/s (±0.0% N=1) overall=122.9
MB/s (±0.0%)
[ 16.052989] do_exit current=d380000
[ 16.053828] mmput atomic=1
[ 16.054511] __mmput atomic=0
[ 16.070382] exit_task_work
[ 16.070981] uacce_fops_release current=d380000
[ 16.071999] CPU: 0 PID: 176 Comm: test_sva_perf Not tainted
5.16.0-rc1-27342-ge5f9f3f99a88-dirty #240
[ 16.074007] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
[ 16.075530] Call trace:
[ 16.076069] dump_backtrace+0x0/0x1a0
[ 16.076887] show_stack+0x20/0x30
[ 16.077629] dump_stack_lvl+0x8c/0xb8
[ 16.078441] dump_stack+0x18/0x34
[ 16.079176] uacce_fops_release+0x44/0xdc
[ 16.080060] __fput+0x78/0x240
[ 16.080743] ____fput+0x18/0x28
[ 16.081447] task_work_run+0x88/0x160
[ 16.082259] do_exit+0x52c/0xa50
[ 16.082974] do_group_exit+0x84/0xa8
[ 16.083768] __wake_up_parent+0x0/0x38
[ 16.084597] invoke_syscall+0x4c/0x110
[ 16.085435] el0_svc_common.constprop.0+0x68/0x128
[ 16.086501] do_el0_svc+0x2c/0x90
[ 16.087243] el0_svc+0x24/0x70
[ 16.087928] el0t_64_sync_handler+0xb0/0xb8
[ 16.088854] el0t_64_sync+0x1a0/0x1a4
[ 16.089775] arm_smmu_sva_unbind
[ 16.090577] iommu_sva_free_pasid mm->pasid=1
[ 16.091763] exit_task_work done
2. Add mmget in bind and mmput in unbind,
Since application do not close fd, so no unbind,& mmput
And fops_release is not called since mm_users account.
log:
[ 101.642690] mmput atomic=3
wd_release_queue no close
Compress bz=512000 nb=1×10, speed=40.3 MB/s (±0.0% N=1) overall=38.7
MB/s (±0.0%)
[ 101.671167] do_exit current=d9daf40
[ 101.672003] mmput atomic=2
[ 101.672712] exit_task_work
[ 101.673292] exit_task_work done
Thanks
>
> So the fix method of using mmget blocks fops_release to be called once
> fd is closed without unbind.
>
> >> Since nginx may have a issue, it does not call close(fd) when nginx -s quit.
> > And you're sure that none of the processes are still alive or in zombie
> > state? Just to cover every possibility.
> It can also reproduced by a simple application exit without close(uacce_fd)
>
> Thanks
>
Hi, Jean & Fenghua
The issue of "iommu/sva: Assign a PASID to mm on PASID allocation and
free it on mm exit"
is mm_pasid_drop in __mmput only ioasid_free(mm->pasid), but keep all
related resources,
like cd tables etc.
This introduces many strange issues.
For example, if application does not use close fd but exit directly,
mm_pasid_drop is called first,
then fops_release->unbind is called, when mm->pasid=-1,
arm_smmu_write_ctx_desc will get error.
And in nginx case, pasid is freed when fork daemon, but cd table is
still there,
then next time, same pasid is allocated.
So either __mmput free pasid as well as all related resources like cd
table,
or convert back to rely on the original unbind method to free pasid and
resources together.
Since SVA is main feature on ARM, which has been developed for years,
It is already used in the product.
It will be horrible if SVA is broke on 5.18.
Any suggestion?
Thanks
On 2022/4/24 上午10:58, Zhangfei Gao wrote:
> On Sat, 23 Apr 2022 at 19:13, [email protected]
> <[email protected]> wrote:
>> Hi, Jean
>>
>> On 2022/4/22 下午11:50, Jean-Philippe Brucker wrote:
>>> On Fri, Apr 22, 2022 at 09:15:01PM +0800, [email protected] wrote:
>>>>> I'm trying to piece together what happens from the kernel point of view.
>>>>>
>>>>> * master process with mm A opens a queue fd through uacce, which calls
>>>>> iommu_sva_bind_device(dev, A) -> PASID 1
>>>>>
>>>>> * master forks and exits. Child (daemon) gets mm B, inherits the queue fd.
>>>>> The device is still bound to mm A with PASID 1, since the queue fd is
>>>>> still open.
>>>>> We discussed this before, but I don't remember where we left off. The
>>>>> child can't use the queue because its mappings are not copied on fork(),
>>>>> and the queue is still bound to the parent mm A. The child either needs to
>>>>> open a new queue or take ownership of the old one with a new uacce ioctl.
>>>> Yes, currently nginx aligned with the case.
>>>> Child process (worker process) reopen uacce,
>>>>
>>>> Master process (do init) open uacce, iommu_sva_bind_device(dev, A) -> PASID
>>>> 1
>>>> Master process fork Child (daemon) and exit.
>>>>
>>>> Child (daemon) does not use PASID 1 any more, only fork and manage worker
>>>> process.
>>>> Worker process reopen uacce, iommu_sva_bind_device(dev, B) PASID 2
>>>>
>>>> So it is expected.
>>> Yes, that's fine
>>>
>>>>> Is that the "IMPLEMENT_DYNAMIC_BIND_FN()" you mention, something out of
>>>>> tree? This operation should unbind from A before binding to B, no?
>>>>> Otherwise we leak PASID 1.
>>>> In 5.16 PASID 1 from master is hold until nginx service stop.
>>>> nginx start
>>>> master:
>>>> iommu_sva_alloc_pasid mm->pasid=1 // master process
>>>>
>>>> lynx https start:
>>>> iommu_sva_alloc_pasid mm->pasid=2 //worker process
>>>>
>>>> nginx stop: from fops_release
>>>> iommu_sva_free_pasid mm->pasid=2 // worker process
>>>> iommu_sva_free_pasid mm->pasid=1 // master process
>>> That's the expected behavior (master could close its fd before forking, in
>>> order to free things up earlier, but it's not mandatory)
>> Currently we unbind in fops_release, so the ioasid allocated in master
>> can only be freed when nginx stop,
>> when all forked fd are closed.
>>
>>>> Have one silly question.
>>>>
>>>> kerne driver
>>>> fops_open
>>>> iommu_sva_bind_device
>>>>
>>>> fops_release
>>>> iommu_sva_unbind_device
>>>>
>>>> application
>>>> main()
>>>> fd = open
>>>> return;
>>>>
>>>> Application exit but not close(fd), is it expected fops_release will be
>>>> called automatically by system?
>>> Yes, the application doesn't have to call close() explicitly, the file
>>> descriptor is closed automatically on exit. Note that the fd is copied on
>>> fork(), so it is only released once parent and all child processes exit.
>> Yes, in case the application ended unexpected, like ctrl+c
>>>> On 5.17
>>>> fops_release is called automatically, as well as iommu_sva_unbind_device.
>>>> On 5.18-rc1.
>>>> fops_release is not called, have to manually call close(fd)
>>> Right that's weird
>> Looks it is caused by the fix patch, via mmget, which may add refcount
>> of fd.
>>
>> Some experiments
>> 1. 5.17, everything works well.
>>
>> 2. 5.17 + patchset of "iommu/sva: Assign a PASID to mm on PASID
>> allocation and free it on mm exit"
>>
>> Test application, exit without close uacce fd
>> First time: fops_release can be called automatically.
>>
>> log:
>> ioasid_alloc ioasid=1
>> iommu_sva_alloc_pasid pasid=1
>> iommu_sva_bind_device handle=00000000263a2ee8
>> ioasid_free ioasid=1
>> uacce_fops_release q=0000000055ca3cdf
>> iommu_sva_unbind_device handle=00000000263a2ee8
>>
>> Second time: hardware reports error
>>
>> uacce_fops_open q=000000008e4d6f78
>> ioasid_alloc ioasid=1
>> iommu_sva_alloc_pasid pasid=1
>> iommu_sva_bind_device handle=00000000cfd11788
>> // Haredware reports error
>> hisi_sec2 0000:b6:00.0: qm_acc_do_task_timeout [error status=0x20] found
>> hisi_sec2 0000:b6:00.0: qm_acc_wb_not_ready_timeout [error status=0x40]
>> found
>> hisi_sec2 0000:b6:00.0: sec_fsm_hbeat_rint [error status=0x20] found
>> hisi_sec2 0000:b6:00.0: Controller resetting...
>> hisi_sec2 0000:b6:00.0: QM mailbox operation timeout!
>> hisi_sec2 0000:b6:00.0: Failed to dump sqc!
>> hisi_sec2 0000:b6:00.0: Failed to drain out data for stopping!
>> hisi_sec2 0000:b6:00.0: Bus lock! Please reset system.
>> hisi_sec2 0000:b6:00.0: Controller reset failed (-110)
>> hisi_sec2 0000:b6:00.0: controller reset failed (-110)
>>
>> 3. Add the fix patch of using mmget in bind.
>> Test application, exit without close uacce fd
>>
>> fops_release can NOT be called automatically, looks mmget adds refcount
>> of fd.
> Test application, exit without closing fd.
>>>> kernel driver
>>>> fops_open
>>>> iommu_sva_bind_device
>>>>
>>>> fops_release
>>>> iommu_sva_unbind_device
> 1.
> 5.17 kernel, no mmget & mmput
>
> wd_release_queue no close
> Compress bz=512000 nb=1×10, speed=139.5 MB/s (±0.0% N=1) overall=122.9
> MB/s (±0.0%)
> [ 16.052989] do_exit current=d380000
> [ 16.053828] mmput atomic=1
> [ 16.054511] __mmput atomic=0
> [ 16.070382] exit_task_work
> [ 16.070981] uacce_fops_release current=d380000
> [ 16.071999] CPU: 0 PID: 176 Comm: test_sva_perf Not tainted
> 5.16.0-rc1-27342-ge5f9f3f99a88-dirty #240
> [ 16.074007] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
> [ 16.075530] Call trace:
> [ 16.076069] dump_backtrace+0x0/0x1a0
> [ 16.076887] show_stack+0x20/0x30
> [ 16.077629] dump_stack_lvl+0x8c/0xb8
> [ 16.078441] dump_stack+0x18/0x34
> [ 16.079176] uacce_fops_release+0x44/0xdc
> [ 16.080060] __fput+0x78/0x240
> [ 16.080743] ____fput+0x18/0x28
> [ 16.081447] task_work_run+0x88/0x160
> [ 16.082259] do_exit+0x52c/0xa50
> [ 16.082974] do_group_exit+0x84/0xa8
> [ 16.083768] __wake_up_parent+0x0/0x38
> [ 16.084597] invoke_syscall+0x4c/0x110
> [ 16.085435] el0_svc_common.constprop.0+0x68/0x128
> [ 16.086501] do_el0_svc+0x2c/0x90
> [ 16.087243] el0_svc+0x24/0x70
> [ 16.087928] el0t_64_sync_handler+0xb0/0xb8
> [ 16.088854] el0t_64_sync+0x1a0/0x1a4
> [ 16.089775] arm_smmu_sva_unbind
> [ 16.090577] iommu_sva_free_pasid mm->pasid=1
> [ 16.091763] exit_task_work done
>
> 2. Add mmget in bind and mmput in unbind,
> Since application do not close fd, so no unbind,& mmput
> And fops_release is not called since mm_users account.
>
> log:
> [ 101.642690] mmput atomic=3
> wd_release_queue no close
> Compress bz=512000 nb=1×10, speed=40.3 MB/s (±0.0% N=1) overall=38.7
> MB/s (±0.0%)
> [ 101.671167] do_exit current=d9daf40
> [ 101.672003] mmput atomic=2
> [ 101.672712] exit_task_work
> [ 101.673292] exit_task_work done
>
> Thanks
>
>
>
>> So the fix method of using mmget blocks fops_release to be called once
>> fd is closed without unbind.
>>
>>>> Since nginx may have a issue, it does not call close(fd) when nginx -s quit.
>>> And you're sure that none of the processes are still alive or in zombie
>>> state? Just to cover every possibility.
>> It can also reproduced by a simple application exit without close(uacce_fd)
>>
>> Thanks
>>
Hi, Dave
On 2022/4/12 下午11:35, Dave Hansen wrote:
> On 4/12/22 08:10, Jean-Philippe Brucker wrote:
>>> I wonder if the Intel and ARM IOMMU code differ in the way they keep
>>> references to the mm, or if this affects Intel as well, but we just
>>> haven't tested the code enough.
>> The Arm code was written expecting the PASID to be freed on unbind(), not
>> mm exit. I missed the change of behavior, sorry (I thought your plan was
>> to extend PASID lifetime, not shorten it?) but as is it seems very broken.
>> For example in the iommu_sva_unbind_device(), we have
>> arm_smmu_mmu_notifier_put() clearing the PASID table entry for
>> "mm->pasid", which is going to end badly if the PASID has been cleared or
>> reallocated. We can't clear the PASID entry in mm exit because at that
>> point the device may still be issuing DMA for that PASID and we need to
>> quiesce the entry rather than deactivate it.
> I think we ended up flipping some of this around on the Intel side.
> Instead of having to quiesce the device on mm exit, we don't let the mm
> exit until the device is done.
>
> When you program the pasid into the device, it's a lot like when you
> create a thread. We bump the reference count on the mm when we program
> the page table pointer into a CPU. We drop the thread's reference to
> the mm when the thread exits and will no longer be using the page tables.
>
> Same thing with pasids. We bump the refcount on the mm when the pasid
> is programmed into the device. Once the device is done with the mm, we
> drop the mm.
>
> Basically, instead of recounting the pasid itself, we just refcount the mm.
This has issue, since refcount the mm will block fops_release to be
called, where unbind may really happen.
For example, user driver are ended unexpectedly,
usually system will end all applications via close fd -> fops_release ->
unbind may happen here.
Now mmget is called, fops_release -> unbind has NO chance to be called,
so ioasid can NOT be freed.
Thanks
On 4/25/22 06:53, Jean-Philippe Brucker wrote:
> On Sat, Apr 23, 2022 at 07:13:39PM +0800, [email protected] wrote:
>>>> On 5.17
>>>> fops_release is called automatically, as well as iommu_sva_unbind_device.
>>>> On 5.18-rc1.
>>>> fops_release is not called, have to manually call close(fd)
>>> Right that's weird
>> Looks it is caused by the fix patch, via mmget, which may add refcount of
>> fd.
> Yes indirectly I think: when the process mmaps the queue, mmap_region()
> takes a reference to the uacce fd. That reference is released either by
> explicit close() or munmap(), or by exit_mmap() (which is triggered by
> mmput()). Since there is an mm->fd dependency, we cannot add a fd->mm
> dependency, so no mmget()/mmput() in bind()/unbind().
>
> I guess we should go back to refcounted PASIDs instead, to avoid freeing
> them until unbind().
Yeah, this is a bit gnarly for -rc4. Let's just make sure there's
nothing else simple we can do.
How does the IOMMU hardware know that all activity to a given PASID is
finished? That activity should, today, be independent of an mm or a
fd's lifetime.
On Mon, Apr 25, 2022 at 07:18:36AM -0700, Dave Hansen wrote:
> On 4/25/22 06:53, Jean-Philippe Brucker wrote:
> > On Sat, Apr 23, 2022 at 07:13:39PM +0800, [email protected] wrote:
> >>>> On 5.17
> >>>> fops_release is called automatically, as well as iommu_sva_unbind_device.
> >>>> On 5.18-rc1.
> >>>> fops_release is not called, have to manually call close(fd)
> >>> Right that's weird
> >> Looks it is caused by the fix patch, via mmget, which may add refcount of
> >> fd.
> > Yes indirectly I think: when the process mmaps the queue, mmap_region()
> > takes a reference to the uacce fd. That reference is released either by
> > explicit close() or munmap(), or by exit_mmap() (which is triggered by
> > mmput()). Since there is an mm->fd dependency, we cannot add a fd->mm
> > dependency, so no mmget()/mmput() in bind()/unbind().
> >
> > I guess we should go back to refcounted PASIDs instead, to avoid freeing
> > them until unbind().
>
> Yeah, this is a bit gnarly for -rc4. Let's just make sure there's
> nothing else simple we can do.
>
> How does the IOMMU hardware know that all activity to a given PASID is
> finished? That activity should, today, be independent of an mm or a
> fd's lifetime.
In the case of uacce, it's tied to the fd lifetime: opening an accelerator
queue calls iommu_sva_bind_device(), which sets up the PASID context in
the IOMMU. Closing the queue calls iommu_sva_unbind_device() which
destroys the PASID context (after the device driver stopped all DMA for
this PASID).
Thanks,
Jean
On Sat, Apr 23, 2022 at 07:13:39PM +0800, [email protected] wrote:
> > > On 5.17
> > > fops_release is called automatically, as well as iommu_sva_unbind_device.
> > > On 5.18-rc1.
> > > fops_release is not called, have to manually call close(fd)
> > Right that's weird
> Looks it is caused by the fix patch, via mmget, which may add refcount of
> fd.
Yes indirectly I think: when the process mmaps the queue, mmap_region()
takes a reference to the uacce fd. That reference is released either by
explicit close() or munmap(), or by exit_mmap() (which is triggered by
mmput()). Since there is an mm->fd dependency, we cannot add a fd->mm
dependency, so no mmget()/mmput() in bind()/unbind().
I guess we should go back to refcounted PASIDs instead, to avoid freeing
them until unbind().
Thanks,
Jean
Hi Jacob,
On Mon, Apr 25, 2022 at 08:34:44AM -0700, Jacob Pan wrote:
> Hi Jean-Philippe,
>
> On Mon, 25 Apr 2022 15:26:40 +0100, Jean-Philippe Brucker
> <[email protected]> wrote:
>
> > On Mon, Apr 25, 2022 at 07:18:36AM -0700, Dave Hansen wrote:
> > > On 4/25/22 06:53, Jean-Philippe Brucker wrote:
> > > > On Sat, Apr 23, 2022 at 07:13:39PM +0800, [email protected]
> > > > wrote:
> > > >>>> On 5.17
> > > >>>> fops_release is called automatically, as well as
> > > >>>> iommu_sva_unbind_device. On 5.18-rc1.
> > > >>>> fops_release is not called, have to manually call close(fd)
> > > >>> Right that's weird
> > > >> Looks it is caused by the fix patch, via mmget, which may add
> > > >> refcount of fd.
> > > > Yes indirectly I think: when the process mmaps the queue,
> > > > mmap_region() takes a reference to the uacce fd. That reference is
> > > > released either by explicit close() or munmap(), or by exit_mmap()
> > > > (which is triggered by mmput()). Since there is an mm->fd dependency,
> > > > we cannot add a fd->mm dependency, so no mmget()/mmput() in
> > > > bind()/unbind().
> > > >
> > > > I guess we should go back to refcounted PASIDs instead, to avoid
> > > > freeing them until unbind().
> > >
> > > Yeah, this is a bit gnarly for -rc4. Let's just make sure there's
> > > nothing else simple we can do.
> > >
> > > How does the IOMMU hardware know that all activity to a given PASID is
> > > finished? That activity should, today, be independent of an mm or a
> > > fd's lifetime.
> >
> > In the case of uacce, it's tied to the fd lifetime: opening an accelerator
> > queue calls iommu_sva_bind_device(), which sets up the PASID context in
> > the IOMMU. Closing the queue calls iommu_sva_unbind_device() which
> > destroys the PASID context (after the device driver stopped all DMA for
> > this PASID).
> >
> For VT-d, it is essentially the same flow except managed by the individual
> drivers such as DSA.
> If free() happens before unbind(), we deactivate the PASIDs and suppress
> faults from the device. When the unbind finally comes, we finalize the
> PASID teardown. It seems we have a need for an intermediate state where
> PASID is "pending free"?
Yes we do have that state, though I'm not sure we need to make it explicit
in the ioasid allocator.
Could we move mm_pasid_drop() to __mmdrop() instead of __mmput()? For Arm
we do need to hold the mm_count until unbind(), and mmgrab()/mmdrop() is
also part of Lu's rework [1].
Thanks,
Jean
[1] https://lore.kernel.org/linux-iommu/[email protected]/
On Mon, Apr 25, 2022 at 08:55:46AM -0700, Dave Hansen wrote:
> On 4/25/22 07:26, Jean-Philippe Brucker wrote:
> >>
> >> How does the IOMMU hardware know that all activity to a given PASID is
> >> finished? That activity should, today, be independent of an mm or a
> >> fd's lifetime.
> > In the case of uacce, it's tied to the fd lifetime: opening an accelerator
> > queue calls iommu_sva_bind_device(), which sets up the PASID context in
> > the IOMMU. Closing the queue calls iommu_sva_unbind_device() which
> > destroys the PASID context (after the device driver stopped all DMA for
> > this PASID).
>
> Could this PASID context destruction move from being "fd-based" to
> happening under mm_pasid_drop()? Logically, it seems like that should
> work because mm_pasid_drop() happens after exit_mmap() where the VMAs
> (which hold references to 'struct file' via vma->vm_file) are torn down.
The problem is that we'd have to request the device driver to stop DMA
before we can destroy the context and free the PASID. We did consider
doing this in the release() MMU notifier, but there were concerns about
blocking mmput() for too long (for example
https://lore.kernel.org/linux-iommu/[email protected]/
though I think there was a more recent discussion). We also need to drain
the PRI and fault queues to get rid of all references to that PASID.
At the moment we disable (but not destroy) the PASID context in release(),
so when the process gets killed pending DMA transactions are silently
ignored. Then the device driver informs us through unbind() that no DMA is
active anymore and we can finish cleaning up, then reuse the PASID.
Thanks,
Jean
On Tue, Apr 26, 2022 at 12:28:00PM +0800, Zhangfei Gao wrote:
> Hi, Jean
>
> On 2022/4/26 上午12:13, Jean-Philippe Brucker wrote:
> > Hi Jacob,
> >
> > On Mon, Apr 25, 2022 at 08:34:44AM -0700, Jacob Pan wrote:
> > > Hi Jean-Philippe,
> > >
> > > On Mon, 25 Apr 2022 15:26:40 +0100, Jean-Philippe Brucker
> > > <[email protected]> wrote:
> > >
> > > > On Mon, Apr 25, 2022 at 07:18:36AM -0700, Dave Hansen wrote:
> > > > > On 4/25/22 06:53, Jean-Philippe Brucker wrote:
> > > > > > On Sat, Apr 23, 2022 at 07:13:39PM +0800, [email protected]
> > > > > > wrote:
> > > > > > > > > On 5.17
> > > > > > > > > fops_release is called automatically, as well as
> > > > > > > > > iommu_sva_unbind_device. On 5.18-rc1.
> > > > > > > > > fops_release is not called, have to manually call close(fd)
> > > > > > > > Right that's weird
> > > > > > > Looks it is caused by the fix patch, via mmget, which may add
> > > > > > > refcount of fd.
> > > > > > Yes indirectly I think: when the process mmaps the queue,
> > > > > > mmap_region() takes a reference to the uacce fd. That reference is
> > > > > > released either by explicit close() or munmap(), or by exit_mmap()
> > > > > > (which is triggered by mmput()). Since there is an mm->fd dependency,
> > > > > > we cannot add a fd->mm dependency, so no mmget()/mmput() in
> > > > > > bind()/unbind().
> > > > > >
> > > > > > I guess we should go back to refcounted PASIDs instead, to avoid
> > > > > > freeing them until unbind().
> > > > > Yeah, this is a bit gnarly for -rc4. Let's just make sure there's
> > > > > nothing else simple we can do.
> > > > >
> > > > > How does the IOMMU hardware know that all activity to a given PASID is
> > > > > finished? That activity should, today, be independent of an mm or a
> > > > > fd's lifetime.
> > > > In the case of uacce, it's tied to the fd lifetime: opening an accelerator
> > > > queue calls iommu_sva_bind_device(), which sets up the PASID context in
> > > > the IOMMU. Closing the queue calls iommu_sva_unbind_device() which
> > > > destroys the PASID context (after the device driver stopped all DMA for
> > > > this PASID).
> > > >
> > > For VT-d, it is essentially the same flow except managed by the individual
> > > drivers such as DSA.
> > > If free() happens before unbind(), we deactivate the PASIDs and suppress
> > > faults from the device. When the unbind finally comes, we finalize the
> > > PASID teardown. It seems we have a need for an intermediate state where
> > > PASID is "pending free"?
> > Yes we do have that state, though I'm not sure we need to make it explicit
> > in the ioasid allocator.
> >
> > Could we move mm_pasid_drop() to __mmdrop() instead of __mmput()? For Arm
> > we do need to hold the mm_count until unbind(), and mmgrab()/mmdrop() is
> > also part of Lu's rework [1].
>
> Move mm_pasid_drop to __mmdrop looks workable.
>
> The nginx works since ioasid is not freed when master exit until nginx stop.
>
> The ioasid does not free immediately when fops_release->unbind finished.
> Instead, __mmdrop happens a bit lazy, which has no issue though
> I passed 10000 times exit without unbind test, the pasid allocation is ok.
>
> Thanks
>
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 9796897560ab..60f417f69367 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -792,6 +792,8 @@ void __mmdrop(struct mm_struct *mm)
> mmu_notifier_subscriptions_destroy(mm);
> check_mm(mm);
> put_user_ns(mm->user_ns);
> + mm_pasid_drop(mm);
> free_mm(mm);
> }
> EXPORT_SYMBOL_GPL(__mmdrop);
> @@ -1190,7 +1192,6 @@ static inline void __mmput(struct mm_struct *mm)
> }
> if (mm->binfmt)
> module_put(mm->binfmt->module);
> - mm_pasid_drop(mm);
> mmdrop(mm);
> }
Thank you very much, Zhangfei!
I just now sent out an identical patch. It works on X86 as well.
So seems the patch is the right fix.
Either you can send out the patch or I add your Signed-off-by? Either way
is OK for me.
Thanks.
-Fenghua
Hi Jean-Philippe,
On Mon, 25 Apr 2022 15:26:40 +0100, Jean-Philippe Brucker
<[email protected]> wrote:
> On Mon, Apr 25, 2022 at 07:18:36AM -0700, Dave Hansen wrote:
> > On 4/25/22 06:53, Jean-Philippe Brucker wrote:
> > > On Sat, Apr 23, 2022 at 07:13:39PM +0800, [email protected]
> > > wrote:
> > >>>> On 5.17
> > >>>> fops_release is called automatically, as well as
> > >>>> iommu_sva_unbind_device. On 5.18-rc1.
> > >>>> fops_release is not called, have to manually call close(fd)
> > >>> Right that's weird
> > >> Looks it is caused by the fix patch, via mmget, which may add
> > >> refcount of fd.
> > > Yes indirectly I think: when the process mmaps the queue,
> > > mmap_region() takes a reference to the uacce fd. That reference is
> > > released either by explicit close() or munmap(), or by exit_mmap()
> > > (which is triggered by mmput()). Since there is an mm->fd dependency,
> > > we cannot add a fd->mm dependency, so no mmget()/mmput() in
> > > bind()/unbind().
> > >
> > > I guess we should go back to refcounted PASIDs instead, to avoid
> > > freeing them until unbind().
> >
> > Yeah, this is a bit gnarly for -rc4. Let's just make sure there's
> > nothing else simple we can do.
> >
> > How does the IOMMU hardware know that all activity to a given PASID is
> > finished? That activity should, today, be independent of an mm or a
> > fd's lifetime.
>
> In the case of uacce, it's tied to the fd lifetime: opening an accelerator
> queue calls iommu_sva_bind_device(), which sets up the PASID context in
> the IOMMU. Closing the queue calls iommu_sva_unbind_device() which
> destroys the PASID context (after the device driver stopped all DMA for
> this PASID).
>
For VT-d, it is essentially the same flow except managed by the individual
drivers such as DSA.
If free() happens before unbind(), we deactivate the PASIDs and suppress
faults from the device. When the unbind finally comes, we finalize the
PASID teardown. It seems we have a need for an intermediate state where
PASID is "pending free"?
> Thanks,
> Jean
> _______________________________________________
> iommu mailing list
> [email protected]
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
Thanks,
Jacob
On 4/25/22 07:26, Jean-Philippe Brucker wrote:
>>
>> How does the IOMMU hardware know that all activity to a given PASID is
>> finished? That activity should, today, be independent of an mm or a
>> fd's lifetime.
> In the case of uacce, it's tied to the fd lifetime: opening an accelerator
> queue calls iommu_sva_bind_device(), which sets up the PASID context in
> the IOMMU. Closing the queue calls iommu_sva_unbind_device() which
> destroys the PASID context (after the device driver stopped all DMA for
> this PASID).
Could this PASID context destruction move from being "fd-based" to
happening under mm_pasid_drop()? Logically, it seems like that should
work because mm_pasid_drop() happens after exit_mmap() where the VMAs
(which hold references to 'struct file' via vma->vm_file) are torn down.
Hi, Jean
On 2022/4/26 上午12:13, Jean-Philippe Brucker wrote:
> Hi Jacob,
>
> On Mon, Apr 25, 2022 at 08:34:44AM -0700, Jacob Pan wrote:
>> Hi Jean-Philippe,
>>
>> On Mon, 25 Apr 2022 15:26:40 +0100, Jean-Philippe Brucker
>> <[email protected]> wrote:
>>
>>> On Mon, Apr 25, 2022 at 07:18:36AM -0700, Dave Hansen wrote:
>>>> On 4/25/22 06:53, Jean-Philippe Brucker wrote:
>>>>> On Sat, Apr 23, 2022 at 07:13:39PM +0800, [email protected]
>>>>> wrote:
>>>>>>>> On 5.17
>>>>>>>> fops_release is called automatically, as well as
>>>>>>>> iommu_sva_unbind_device. On 5.18-rc1.
>>>>>>>> fops_release is not called, have to manually call close(fd)
>>>>>>> Right that's weird
>>>>>> Looks it is caused by the fix patch, via mmget, which may add
>>>>>> refcount of fd.
>>>>> Yes indirectly I think: when the process mmaps the queue,
>>>>> mmap_region() takes a reference to the uacce fd. That reference is
>>>>> released either by explicit close() or munmap(), or by exit_mmap()
>>>>> (which is triggered by mmput()). Since there is an mm->fd dependency,
>>>>> we cannot add a fd->mm dependency, so no mmget()/mmput() in
>>>>> bind()/unbind().
>>>>>
>>>>> I guess we should go back to refcounted PASIDs instead, to avoid
>>>>> freeing them until unbind().
>>>> Yeah, this is a bit gnarly for -rc4. Let's just make sure there's
>>>> nothing else simple we can do.
>>>>
>>>> How does the IOMMU hardware know that all activity to a given PASID is
>>>> finished? That activity should, today, be independent of an mm or a
>>>> fd's lifetime.
>>> In the case of uacce, it's tied to the fd lifetime: opening an accelerator
>>> queue calls iommu_sva_bind_device(), which sets up the PASID context in
>>> the IOMMU. Closing the queue calls iommu_sva_unbind_device() which
>>> destroys the PASID context (after the device driver stopped all DMA for
>>> this PASID).
>>>
>> For VT-d, it is essentially the same flow except managed by the individual
>> drivers such as DSA.
>> If free() happens before unbind(), we deactivate the PASIDs and suppress
>> faults from the device. When the unbind finally comes, we finalize the
>> PASID teardown. It seems we have a need for an intermediate state where
>> PASID is "pending free"?
> Yes we do have that state, though I'm not sure we need to make it explicit
> in the ioasid allocator.
>
> Could we move mm_pasid_drop() to __mmdrop() instead of __mmput()? For Arm
> we do need to hold the mm_count until unbind(), and mmgrab()/mmdrop() is
> also part of Lu's rework [1].
Move mm_pasid_drop to __mmdrop looks workable.
The nginx works since ioasid is not freed when master exit until nginx stop.
The ioasid does not free immediately when fops_release->unbind finished.
Instead, __mmdrop happens a bit lazy, which has no issue though
I passed 10000 times exit without unbind test, the pasid allocation is ok.
Thanks
diff --git a/kernel/fork.c b/kernel/fork.c
index 9796897560ab..60f417f69367 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -792,6 +792,8 @@ void __mmdrop(struct mm_struct *mm)
mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
+ mm_pasid_drop(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1190,7 +1192,6 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
- mm_pasid_drop(mm);
mmdrop(mm);
}
>
> Thanks,
> Jean
>
> [1] https://lore.kernel.org/linux-iommu/[email protected]/
Hi, Jean and Zhangfei,
On Mon, Apr 25, 2022 at 05:13:02PM +0100, Jean-Philippe Brucker wrote:
> Could we move mm_pasid_drop() to __mmdrop() instead of __mmput()? For Arm
> we do need to hold the mm_count until unbind(), and mmgrab()/mmdrop() is
> also part of Lu's rework [1].
Is this a right fix for the issue? Could you please test it on ARM?
I don't have an ARM machine.
Thanks.
-Fenghua
From 84aa68f6174439d863c40cdc2db0e1b89d620dd0 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <[email protected]>
Date: Fri, 15 Apr 2022 00:51:33 -0700
Subject: [PATCH] iommu/sva: Fix PASID use-after-free issue
A PASID might be still used on ARM after it is freed in __mmput().
process:
open()->sva_bind()->ioasid_alloc() = N; // Get PASID N for the mm
exit();
exit_mm()->__mmput()->mm_pasid_drop()->mm->pasid = -1; // PASID -1
exit_files()->release(dev)->sva_unbind()->use mm->pasid; // Failure
To avoid the use-after-free issue, free the PASID after no device uses it,
i.e. after all devices are unbound from the mm.
sva_bind()/sva_unbind() call mmgrab()/mmdrop() to track mm->mm_count.
__mmdrop() is called only after mm->mm_count is zero. So freeing the PASID
in __mmdrop() guarantees the PASID is safely freed only after no device
is bound to the mm.
Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit")
Reported-by: Zhangfei Gao <[email protected]>
Suggested-by: Jean-Philippe Brucker <[email protected]>
Suggested-by: Jacob Pan <[email protected]>
Signed-off-by: Fenghua Yu <[email protected]>
---
kernel/fork.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 9796897560ab..35a3beff140b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -792,6 +792,7 @@ void __mmdrop(struct mm_struct *mm)
mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
+ mm_pasid_drop(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1190,7 +1191,6 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
- mm_pasid_drop(mm);
mmdrop(mm);
}
--
2.32.0
On 2022/4/26 下午12:20, Fenghua Yu wrote:
> Hi, Jean and Zhangfei,
>
> On Mon, Apr 25, 2022 at 05:13:02PM +0100, Jean-Philippe Brucker wrote:
>> Could we move mm_pasid_drop() to __mmdrop() instead of __mmput()? For Arm
>> we do need to hold the mm_count until unbind(), and mmgrab()/mmdrop() is
>> also part of Lu's rework [1].
> Is this a right fix for the issue? Could you please test it on ARM?
> I don't have an ARM machine.
>
> Thanks.
>
> -Fenghua
>
> From 84aa68f6174439d863c40cdc2db0e1b89d620dd0 Mon Sep 17 00:00:00 2001
> From: Fenghua Yu <[email protected]>
> Date: Fri, 15 Apr 2022 00:51:33 -0700
> Subject: [PATCH] iommu/sva: Fix PASID use-after-free issue
>
> A PASID might be still used on ARM after it is freed in __mmput().
>
> process:
> open()->sva_bind()->ioasid_alloc() = N; // Get PASID N for the mm
> exit();
> exit_mm()->__mmput()->mm_pasid_drop()->mm->pasid = -1; // PASID -1
> exit_files()->release(dev)->sva_unbind()->use mm->pasid; // Failure
>
> To avoid the use-after-free issue, free the PASID after no device uses it,
> i.e. after all devices are unbound from the mm.
>
> sva_bind()/sva_unbind() call mmgrab()/mmdrop() to track mm->mm_count.
> __mmdrop() is called only after mm->mm_count is zero. So freeing the PASID
> in __mmdrop() guarantees the PASID is safely freed only after no device
> is bound to the mm.
>
> Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit")
>
> Reported-by: Zhangfei Gao <[email protected]>
> Suggested-by: Jean-Philippe Brucker <[email protected]>
> Suggested-by: Jacob Pan <[email protected]>
> Signed-off-by: Fenghua Yu <[email protected]>
Thanks for the fix.
Tested-by: Zhangfei Gao <[email protected]>
> ---
> kernel/fork.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 9796897560ab..35a3beff140b 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -792,6 +792,7 @@ void __mmdrop(struct mm_struct *mm)
> mmu_notifier_subscriptions_destroy(mm);
> check_mm(mm);
> put_user_ns(mm->user_ns);
> + mm_pasid_drop(mm);
> free_mm(mm);
> }
> EXPORT_SYMBOL_GPL(__mmdrop);
> @@ -1190,7 +1191,6 @@ static inline void __mmput(struct mm_struct *mm)
> }
> if (mm->binfmt)
> module_put(mm->binfmt->module);
> - mm_pasid_drop(mm);
> mmdrop(mm);
> }
>
Hi Jean-Philippe,
On Mon, 25 Apr 2022 17:13:02 +0100, Jean-Philippe Brucker
<[email protected]> wrote:
> Hi Jacob,
>
> On Mon, Apr 25, 2022 at 08:34:44AM -0700, Jacob Pan wrote:
> > Hi Jean-Philippe,
> >
> > On Mon, 25 Apr 2022 15:26:40 +0100, Jean-Philippe Brucker
> > <[email protected]> wrote:
> >
> > > On Mon, Apr 25, 2022 at 07:18:36AM -0700, Dave Hansen wrote:
> > > > On 4/25/22 06:53, Jean-Philippe Brucker wrote:
> > > > > On Sat, Apr 23, 2022 at 07:13:39PM +0800, [email protected]
> > > > > wrote:
> > > > >>>> On 5.17
> > > > >>>> fops_release is called automatically, as well as
> > > > >>>> iommu_sva_unbind_device. On 5.18-rc1.
> > > > >>>> fops_release is not called, have to manually call close(fd)
> > > > >>> Right that's weird
> > > > >> Looks it is caused by the fix patch, via mmget, which may add
> > > > >> refcount of fd.
> > > > > Yes indirectly I think: when the process mmaps the queue,
> > > > > mmap_region() takes a reference to the uacce fd. That reference is
> > > > > released either by explicit close() or munmap(), or by exit_mmap()
> > > > > (which is triggered by mmput()). Since there is an mm->fd
> > > > > dependency, we cannot add a fd->mm dependency, so no
> > > > > mmget()/mmput() in bind()/unbind().
> > > > >
> > > > > I guess we should go back to refcounted PASIDs instead, to avoid
> > > > > freeing them until unbind().
> > > >
> > > > Yeah, this is a bit gnarly for -rc4. Let's just make sure there's
> > > > nothing else simple we can do.
> > > >
> > > > How does the IOMMU hardware know that all activity to a given PASID
> > > > is finished? That activity should, today, be independent of an mm
> > > > or a fd's lifetime.
> > >
> > > In the case of uacce, it's tied to the fd lifetime: opening an
> > > accelerator queue calls iommu_sva_bind_device(), which sets up the
> > > PASID context in the IOMMU. Closing the queue calls
> > > iommu_sva_unbind_device() which destroys the PASID context (after the
> > > device driver stopped all DMA for this PASID).
> > >
> > For VT-d, it is essentially the same flow except managed by the
> > individual drivers such as DSA.
> > If free() happens before unbind(), we deactivate the PASIDs and suppress
> > faults from the device. When the unbind finally comes, we finalize the
> > PASID teardown. It seems we have a need for an intermediate state where
> > PASID is "pending free"?
>
> Yes we do have that state, though I'm not sure we need to make it explicit
> in the ioasid allocator.
>
IMHO, making it explicit would fail ioasid_get() on a "pending free" PASID.
Making free a one-way trip and prevent further complications.
> Could we move mm_pasid_drop() to __mmdrop() instead of __mmput()? For Arm
> we do need to hold the mm_count until unbind(), and mmgrab()/mmdrop() is
> also part of Lu's rework [1].
>
Yes, I would agree. IIRC, Fenghua's early patch was doing pasid drop
in mmdrop. Maybe I missed something.
> Thanks,
> Jean
>
> [1]
> https://lore.kernel.org/linux-iommu/[email protected]/
Thanks,
Jacob
On 2022/4/26 下午12:36, Fenghua Yu wrote:
> On Tue, Apr 26, 2022 at 12:28:00PM +0800, Zhangfei Gao wrote:
>> Hi, Jean
>>
>> On 2022/4/26 上午12:13, Jean-Philippe Brucker wrote:
>>> Hi Jacob,
>>>
>>> On Mon, Apr 25, 2022 at 08:34:44AM -0700, Jacob Pan wrote:
>>>> Hi Jean-Philippe,
>>>>
>>>> On Mon, 25 Apr 2022 15:26:40 +0100, Jean-Philippe Brucker
>>>> <[email protected]> wrote:
>>>>
>>>>> On Mon, Apr 25, 2022 at 07:18:36AM -0700, Dave Hansen wrote:
>>>>>> On 4/25/22 06:53, Jean-Philippe Brucker wrote:
>>>>>>> On Sat, Apr 23, 2022 at 07:13:39PM +0800, [email protected]
>>>>>>> wrote:
>>>>>>>>>> On 5.17
>>>>>>>>>> fops_release is called automatically, as well as
>>>>>>>>>> iommu_sva_unbind_device. On 5.18-rc1.
>>>>>>>>>> fops_release is not called, have to manually call close(fd)
>>>>>>>>> Right that's weird
>>>>>>>> Looks it is caused by the fix patch, via mmget, which may add
>>>>>>>> refcount of fd.
>>>>>>> Yes indirectly I think: when the process mmaps the queue,
>>>>>>> mmap_region() takes a reference to the uacce fd. That reference is
>>>>>>> released either by explicit close() or munmap(), or by exit_mmap()
>>>>>>> (which is triggered by mmput()). Since there is an mm->fd dependency,
>>>>>>> we cannot add a fd->mm dependency, so no mmget()/mmput() in
>>>>>>> bind()/unbind().
>>>>>>>
>>>>>>> I guess we should go back to refcounted PASIDs instead, to avoid
>>>>>>> freeing them until unbind().
>>>>>> Yeah, this is a bit gnarly for -rc4. Let's just make sure there's
>>>>>> nothing else simple we can do.
>>>>>>
>>>>>> How does the IOMMU hardware know that all activity to a given PASID is
>>>>>> finished? That activity should, today, be independent of an mm or a
>>>>>> fd's lifetime.
>>>>> In the case of uacce, it's tied to the fd lifetime: opening an accelerator
>>>>> queue calls iommu_sva_bind_device(), which sets up the PASID context in
>>>>> the IOMMU. Closing the queue calls iommu_sva_unbind_device() which
>>>>> destroys the PASID context (after the device driver stopped all DMA for
>>>>> this PASID).
>>>>>
>>>> For VT-d, it is essentially the same flow except managed by the individual
>>>> drivers such as DSA.
>>>> If free() happens before unbind(), we deactivate the PASIDs and suppress
>>>> faults from the device. When the unbind finally comes, we finalize the
>>>> PASID teardown. It seems we have a need for an intermediate state where
>>>> PASID is "pending free"?
>>> Yes we do have that state, though I'm not sure we need to make it explicit
>>> in the ioasid allocator.
>>>
>>> Could we move mm_pasid_drop() to __mmdrop() instead of __mmput()? For Arm
>>> we do need to hold the mm_count until unbind(), and mmgrab()/mmdrop() is
>>> also part of Lu's rework [1].
>> Move mm_pasid_drop to __mmdrop looks workable.
>>
>> The nginx works since ioasid is not freed when master exit until nginx stop.
>>
>> The ioasid does not free immediately when fops_release->unbind finished.
>> Instead, __mmdrop happens a bit lazy, which has no issue though
>> I passed 10000 times exit without unbind test, the pasid allocation is ok.
>>
>> Thanks
>>
>>
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 9796897560ab..60f417f69367 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -792,6 +792,8 @@ void __mmdrop(struct mm_struct *mm)
>> mmu_notifier_subscriptions_destroy(mm);
>> check_mm(mm);
>> put_user_ns(mm->user_ns);
>> + mm_pasid_drop(mm);
>> free_mm(mm);
>> }
>> EXPORT_SYMBOL_GPL(__mmdrop);
>> @@ -1190,7 +1192,6 @@ static inline void __mmput(struct mm_struct *mm)
>> }
>> if (mm->binfmt)
>> module_put(mm->binfmt->module);
>> - mm_pasid_drop(mm);
>> mmdrop(mm);
>> }
> Thank you very much, Zhangfei!
>
> I just now sent out an identical patch. It works on X86 as well.
>
> So seems the patch is the right fix.
>
> Either you can send out the patch or I add your Signed-off-by? Either way
> is OK for me.
Thanks Fenghua,
It does not matter. Have added tested-by.
I was in stress test for checking the pasid free, since it was freed lazily.
Thank all for the help, a bit nervous, since it is rc4 now.
On 4/25/22 09:40, Jean-Philippe Brucker wrote:
> The problem is that we'd have to request the device driver to stop DMA
> before we can destroy the context and free the PASID. We did consider
> doing this in the release() MMU notifier, but there were concerns about
> blocking mmput() for too long (for example
> https://lore.kernel.org/linux-iommu/[email protected]/
> though I think there was a more recent discussion). We also need to drain
> the PRI and fault queues to get rid of all references to that PASID.
Is the concern truly about blocking mmput() itself? Or, is it about
releasing the resources associated with the mm?
On 4/26/22 09:48, Jean-Philippe Brucker wrote:
> On Tue, Apr 26, 2022 at 08:27:00AM -0700, Dave Hansen wrote:
>> On 4/25/22 09:40, Jean-Philippe Brucker wrote:
>>> The problem is that we'd have to request the device driver to stop DMA
>>> before we can destroy the context and free the PASID. We did consider
>>> doing this in the release() MMU notifier, but there were concerns about
>>> blocking mmput() for too long (for example
>>> https://lore.kernel.org/linux-iommu/[email protected]/
>>> though I think there was a more recent discussion). We also need to drain
>>> the PRI and fault queues to get rid of all references to that PASID.
>> Is the concern truly about blocking mmput() itself? Or, is it about
>> releasing the resources associated with the mm?
> The latter I think, this one was about releasing pages as fast as possible
> if the process is picked by the OOM killer.
We're tying the PASID to the life of the mm itself, not the mm's address
space. That means the PASID should be tied to
mmgrab()/mmdrop()/mm->mm_count.
The address space is what the OOM killer is after. That gets refcounted
with mmget()/mmput()/mm->mm_users. The OOM killer is satiated by the
page freeing done in __mmput()->exit_mmap().
Also, all the VMAs should be gone after exit_mmap(). So, even if
vma->vm_file was holding a reference to a device driver, that reference
should be gone by the time __mmdrop() is actually freeing the PASID.
On Tue, Apr 26, 2022 at 08:27:00AM -0700, Dave Hansen wrote:
> On 4/25/22 09:40, Jean-Philippe Brucker wrote:
> > The problem is that we'd have to request the device driver to stop DMA
> > before we can destroy the context and free the PASID. We did consider
> > doing this in the release() MMU notifier, but there were concerns about
> > blocking mmput() for too long (for example
> > https://lore.kernel.org/linux-iommu/[email protected]/
> > though I think there was a more recent discussion). We also need to drain
> > the PRI and fault queues to get rid of all references to that PASID.
>
> Is the concern truly about blocking mmput() itself? Or, is it about
> releasing the resources associated with the mm?
The latter I think, this one was about releasing pages as fast as possible
if the process is picked by the OOM killer.
Thanks,
Jean
Hi, Dave and Jean,
On Tue, Apr 26, 2022 at 01:04:45PM +0800, Zhangfei Gao wrote:
>
>
> On 2022/4/26 下午12:20, Fenghua Yu wrote:
> > Hi, Jean and Zhangfei,
> >
> > On Mon, Apr 25, 2022 at 05:13:02PM +0100, Jean-Philippe Brucker wrote:
> > > Could we move mm_pasid_drop() to __mmdrop() instead of __mmput()? For Arm
> > > we do need to hold the mm_count until unbind(), and mmgrab()/mmdrop() is
> > > also part of Lu's rework [1].
> > Is this a right fix for the issue? Could you please test it on ARM?
> > I don't have an ARM machine.
> >
> > Thanks.
> >
> > -Fenghua
> >
> > From 84aa68f6174439d863c40cdc2db0e1b89d620dd0 Mon Sep 17 00:00:00 2001
> > From: Fenghua Yu <[email protected]>
> > Date: Fri, 15 Apr 2022 00:51:33 -0700
> > Subject: [PATCH] iommu/sva: Fix PASID use-after-free issue
> >
> > A PASID might be still used on ARM after it is freed in __mmput().
> >
> > process:
> > open()->sva_bind()->ioasid_alloc() = N; // Get PASID N for the mm
> > exit();
> > exit_mm()->__mmput()->mm_pasid_drop()->mm->pasid = -1; // PASID -1
> > exit_files()->release(dev)->sva_unbind()->use mm->pasid; // Failure
> >
> > To avoid the use-after-free issue, free the PASID after no device uses it,
> > i.e. after all devices are unbound from the mm.
> >
> > sva_bind()/sva_unbind() call mmgrab()/mmdrop() to track mm->mm_count.
> > __mmdrop() is called only after mm->mm_count is zero. So freeing the PASID
> > in __mmdrop() guarantees the PASID is safely freed only after no device
> > is bound to the mm.
> >
> > Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit")
> >
> > Reported-by: Zhangfei Gao <[email protected]>
> > Suggested-by: Jean-Philippe Brucker <[email protected]>
> > Suggested-by: Jacob Pan <[email protected]>
> > Signed-off-by: Fenghua Yu <[email protected]>
> Thanks for the fix.
>
> Tested-by: Zhangfei Gao <[email protected]>
>
>
> > ---
> > kernel/fork.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 9796897560ab..35a3beff140b 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -792,6 +792,7 @@ void __mmdrop(struct mm_struct *mm)
> > mmu_notifier_subscriptions_destroy(mm);
> > check_mm(mm);
> > put_user_ns(mm->user_ns);
> > + mm_pasid_drop(mm);
> > free_mm(mm);
> > }
> > EXPORT_SYMBOL_GPL(__mmdrop);
> > @@ -1190,7 +1191,6 @@ static inline void __mmput(struct mm_struct *mm)
> > }
> > if (mm->binfmt)
> > module_put(mm->binfmt->module);
> > - mm_pasid_drop(mm);
> > mmdrop(mm);
> > }
>
Is this patch a good fix? Will you help push the fix into upstream?
Thank you very much!
-Fenghua
On Wed, Apr 27, 2022 at 05:54:57PM -0700, Fenghua Yu wrote:
> Hi, Dave and Jean,
>
> On Tue, Apr 26, 2022 at 01:04:45PM +0800, Zhangfei Gao wrote:
> >
> >
> > On 2022/4/26 下午12:20, Fenghua Yu wrote:
> > > Hi, Jean and Zhangfei,
> > >
> > > On Mon, Apr 25, 2022 at 05:13:02PM +0100, Jean-Philippe Brucker wrote:
> > > > Could we move mm_pasid_drop() to __mmdrop() instead of __mmput()? For Arm
> > > > we do need to hold the mm_count until unbind(), and mmgrab()/mmdrop() is
> > > > also part of Lu's rework [1].
> > > Is this a right fix for the issue? Could you please test it on ARM?
> > > I don't have an ARM machine.
> > >
> > > Thanks.
> > >
> > > -Fenghua
> > >
> > > From 84aa68f6174439d863c40cdc2db0e1b89d620dd0 Mon Sep 17 00:00:00 2001
> > > From: Fenghua Yu <[email protected]>
> > > Date: Fri, 15 Apr 2022 00:51:33 -0700
> > > Subject: [PATCH] iommu/sva: Fix PASID use-after-free issue
> > >
> > > A PASID might be still used on ARM after it is freed in __mmput().
> > >
> > > process:
> > > open()->sva_bind()->ioasid_alloc() = N; // Get PASID N for the mm
> > > exit();
> > > exit_mm()->__mmput()->mm_pasid_drop()->mm->pasid = -1; // PASID -1
> > > exit_files()->release(dev)->sva_unbind()->use mm->pasid; // Failure
> > >
> > > To avoid the use-after-free issue, free the PASID after no device uses it,
> > > i.e. after all devices are unbound from the mm.
> > >
> > > sva_bind()/sva_unbind() call mmgrab()/mmdrop() to track mm->mm_count.
> > > __mmdrop() is called only after mm->mm_count is zero. So freeing the PASID
> > > in __mmdrop() guarantees the PASID is safely freed only after no device
> > > is bound to the mm.
> > >
> > > Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit")
> > >
> > > Reported-by: Zhangfei Gao <[email protected]>
> > > Suggested-by: Jean-Philippe Brucker <[email protected]>
> > > Suggested-by: Jacob Pan <[email protected]>
> > > Signed-off-by: Fenghua Yu <[email protected]>
> > Thanks for the fix.
> >
> > Tested-by: Zhangfei Gao <[email protected]>
> >
> >
> > > ---
> > > kernel/fork.c | 2 +-
> > > 1 file changed, 1 insertion(+), 1 deletion(-)
> > >
> > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > index 9796897560ab..35a3beff140b 100644
> > > --- a/kernel/fork.c
> > > +++ b/kernel/fork.c
> > > @@ -792,6 +792,7 @@ void __mmdrop(struct mm_struct *mm)
> > > mmu_notifier_subscriptions_destroy(mm);
> > > check_mm(mm);
> > > put_user_ns(mm->user_ns);
> > > + mm_pasid_drop(mm);
> > > free_mm(mm);
> > > }
> > > EXPORT_SYMBOL_GPL(__mmdrop);
> > > @@ -1190,7 +1191,6 @@ static inline void __mmput(struct mm_struct *mm)
> > > }
> > > if (mm->binfmt)
> > > module_put(mm->binfmt->module);
> > > - mm_pasid_drop(mm);
> > > mmdrop(mm);
> > > }
> >
>
> Is this patch a good fix? Will you help push the fix into upstream?
Yes, I think it's the right thing to do for now. Could you resend it
separately so it gets visibility from the maintainers?
Reviewed-by: Jean-Philippe Brucker <[email protected]>
On Tue, Apr 26, 2022 at 04:31:57PM -0700, Dave Hansen wrote:
> On 4/26/22 09:48, Jean-Philippe Brucker wrote:
> > On Tue, Apr 26, 2022 at 08:27:00AM -0700, Dave Hansen wrote:
> >> On 4/25/22 09:40, Jean-Philippe Brucker wrote:
> >>> The problem is that we'd have to request the device driver to stop DMA
> >>> before we can destroy the context and free the PASID. We did consider
> >>> doing this in the release() MMU notifier, but there were concerns about
> >>> blocking mmput() for too long (for example
> >>> https://lore.kernel.org/linux-iommu/[email protected]/
> >>> though I think there was a more recent discussion). We also need to drain
> >>> the PRI and fault queues to get rid of all references to that PASID.
> >> Is the concern truly about blocking mmput() itself? Or, is it about
> >> releasing the resources associated with the mm?
> > The latter I think, this one was about releasing pages as fast as possible
> > if the process is picked by the OOM killer.
>
> We're tying the PASID to the life of the mm itself, not the mm's address
> space. That means the PASID should be tied to
> mmgrab()/mmdrop()/mm->mm_count.
>
> The address space is what the OOM killer is after. That gets refcounted
> with mmget()/mmput()/mm->mm_users. The OOM killer is satiated by the
> page freeing done in __mmput()->exit_mmap().
>
> Also, all the VMAs should be gone after exit_mmap(). So, even if
> vma->vm_file was holding a reference to a device driver, that reference
> should be gone by the time __mmdrop() is actually freeing the PASID.
I agree with all that. The concern was about tearing down the PASID in the
IOMMU and device from the release() MMU notifier, which would happen in
exit_mmap(). But doing the teardown at or before __mmdrop() is fine. And
since the IOMMU drivers need to hold mm->mm_count anyway between bind()
and unbind(), I think Fenghua's fix works.
Thanks,
Jean
On Thu, Apr 28, 2022 at 08:09:04AM -0700, Dave Hansen wrote:
> On 4/25/22 21:20, Fenghua Yu wrote:
> >>From 84aa68f6174439d863c40cdc2db0e1b89d620dd0 Mon Sep 17 00:00:00 2001
> > From: Fenghua Yu <[email protected]>
> > Date: Fri, 15 Apr 2022 00:51:33 -0700
> > Subject: [PATCH] iommu/sva: Fix PASID use-after-free issue
> >
> > A PASID might be still used on ARM after it is freed in __mmput().
>
> Is it really just ARM?
>
> > process:
> > open()->sva_bind()->ioasid_alloc() = N; // Get PASID N for the mm
> > exit();
> > exit_mm()->__mmput()->mm_pasid_drop()->mm->pasid = -1; // PASID -1
> > exit_files()->release(dev)->sva_unbind()->use mm->pasid; // Failure
> >
> > To avoid the use-after-free issue, free the PASID after no device uses it,
> > i.e. after all devices are unbound from the mm.
> >
> > sva_bind()/sva_unbind() call mmgrab()/mmdrop() to track mm->mm_count.
> > __mmdrop() is called only after mm->mm_count is zero. So freeing the PASID
> > in __mmdrop() guarantees the PASID is safely freed only after no device
> > is bound to the mm.
>
> Does this changelog work for everyone?
>
> ==
>
> tl;dr: The PASID is being freed too early. It needs to stay around
> until after device drivers that might be using it have had a chance to
> clear it out of the hardware.
>
> --
>
> As a reminder:
>
> mmget() /mmput() refcount the mm's address space
> mmgrab()/mmdrop() refcount the mm itself
>
> The PASID is currently tied to the life of the mm's address space and
> freed in __mmput(). This makes logical sense because the PASID can't be
> used once the address space is gone.
>
> But, this misses an important point: even after the address space is
> gone, the PASID will still be programmed into a device. Device drivers
> might, for instance, still need to flush operations that are outstanding
> and need to use that PASID. They do this at ->release() time.
It's not really clear which release() this is. For us it's file descriptor
release() (not MMU notifier release(), which is how I initially understood
this sentence)
>
> Device drivers hold a reference on the mm itself and drop it at
> ->release() time. But, the device driver holds a reference mm itself,
"to the mm"
(To be pendantic it's the IOMMU driver that holds this reference, and
the device driver calls the IOMMU driver to release it, but the idea is
the same.)
> not the address space. The address space (and the PASID) is long gone
> by the time the driver tries to clean up. This is effectively a
> use-after-free bug on the PASID.
>
> To fix this, move the PASID free operation from __mmput() to __mmdrop().
> This ensures that the device drivers' existing mmgrab() keeps the PASID
> allocated until they drop their mm reference.
Good changelog otherwise
Thanks,
Jean
>
> > kernel/fork.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 9796897560ab..35a3beff140b 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -792,6 +792,7 @@ void __mmdrop(struct mm_struct *mm)
> > mmu_notifier_subscriptions_destroy(mm);
> > check_mm(mm);
> > put_user_ns(mm->user_ns);
> > + mm_pasid_drop(mm);
> > free_mm(mm);
> > }
> > EXPORT_SYMBOL_GPL(__mmdrop);
> > @@ -1190,7 +1191,6 @@ static inline void __mmput(struct mm_struct *mm)
> > }
> > if (mm->binfmt)
> > module_put(mm->binfmt->module);
> > - mm_pasid_drop(mm);
> > mmdrop(mm);
> > }
> >
>
Hi, Dave,
On Thu, Apr 28, 2022 at 08:09:04AM -0700, Dave Hansen wrote:
> On 4/25/22 21:20, Fenghua Yu wrote:
> >>From 84aa68f6174439d863c40cdc2db0e1b89d620dd0 Mon Sep 17 00:00:00 2001
> > From: Fenghua Yu <[email protected]>
> > Date: Fri, 15 Apr 2022 00:51:33 -0700
> > Subject: [PATCH] iommu/sva: Fix PASID use-after-free issue
> >
> > A PASID might be still used on ARM after it is freed in __mmput().
>
> Is it really just ARM?
Actually it should happen on X86 as well. I will remove "on ARM" in the
changelog.
>
> > process:
> > open()->sva_bind()->ioasid_alloc() = N; // Get PASID N for the mm
> > exit();
> > exit_mm()->__mmput()->mm_pasid_drop()->mm->pasid = -1; // PASID -1
> > exit_files()->release(dev)->sva_unbind()->use mm->pasid; // Failure
> >
> > To avoid the use-after-free issue, free the PASID after no device uses it,
> > i.e. after all devices are unbound from the mm.
> >
> > sva_bind()/sva_unbind() call mmgrab()/mmdrop() to track mm->mm_count.
> > __mmdrop() is called only after mm->mm_count is zero. So freeing the PASID
> > in __mmdrop() guarantees the PASID is safely freed only after no device
> > is bound to the mm.
>
> Does this changelog work for everyone?
>
> ==
>
> tl;dr: The PASID is being freed too early. It needs to stay around
> until after device drivers that might be using it have had a chance to
> clear it out of the hardware.
>
Do you want me to change the changlog to add both this paragraph and the
following paragraph?
> --
>
> As a reminder:
>
> mmget() /mmput() refcount the mm's address space
> mmgrab()/mmdrop() refcount the mm itself
>
> The PASID is currently tied to the life of the mm's address space and
> freed in __mmput(). This makes logical sense because the PASID can't be
> used once the address space is gone.
>
> But, this misses an important point: even after the address space is
> gone, the PASID will still be programmed into a device. Device drivers
> might, for instance, still need to flush operations that are outstanding
> and need to use that PASID. They do this at ->release() time.
>
> Device drivers hold a reference on the mm itself and drop it at
> ->release() time. But, the device driver holds a reference mm itself,
> not the address space. The address space (and the PASID) is long gone
> by the time the driver tries to clean up. This is effectively a
> use-after-free bug on the PASID.
>
> To fix this, move the PASID free operation from __mmput() to __mmdrop().
> This ensures that the device drivers' existing mmgrab() keeps the PASID
> allocated until they drop their mm reference.
>
Thank you very much!
-Fenghua
On 4/25/22 21:20, Fenghua Yu wrote:
>>From 84aa68f6174439d863c40cdc2db0e1b89d620dd0 Mon Sep 17 00:00:00 2001
> From: Fenghua Yu <[email protected]>
> Date: Fri, 15 Apr 2022 00:51:33 -0700
> Subject: [PATCH] iommu/sva: Fix PASID use-after-free issue
>
> A PASID might be still used on ARM after it is freed in __mmput().
Is it really just ARM?
> process:
> open()->sva_bind()->ioasid_alloc() = N; // Get PASID N for the mm
> exit();
> exit_mm()->__mmput()->mm_pasid_drop()->mm->pasid = -1; // PASID -1
> exit_files()->release(dev)->sva_unbind()->use mm->pasid; // Failure
>
> To avoid the use-after-free issue, free the PASID after no device uses it,
> i.e. after all devices are unbound from the mm.
>
> sva_bind()/sva_unbind() call mmgrab()/mmdrop() to track mm->mm_count.
> __mmdrop() is called only after mm->mm_count is zero. So freeing the PASID
> in __mmdrop() guarantees the PASID is safely freed only after no device
> is bound to the mm.
Does this changelog work for everyone?
==
tl;dr: The PASID is being freed too early. It needs to stay around
until after device drivers that might be using it have had a chance to
clear it out of the hardware.
--
As a reminder:
mmget() /mmput() refcount the mm's address space
mmgrab()/mmdrop() refcount the mm itself
The PASID is currently tied to the life of the mm's address space and
freed in __mmput(). This makes logical sense because the PASID can't be
used once the address space is gone.
But, this misses an important point: even after the address space is
gone, the PASID will still be programmed into a device. Device drivers
might, for instance, still need to flush operations that are outstanding
and need to use that PASID. They do this at ->release() time.
Device drivers hold a reference on the mm itself and drop it at
->release() time. But, the device driver holds a reference mm itself,
not the address space. The address space (and the PASID) is long gone
by the time the driver tries to clean up. This is effectively a
use-after-free bug on the PASID.
To fix this, move the PASID free operation from __mmput() to __mmdrop().
This ensures that the device drivers' existing mmgrab() keeps the PASID
allocated until they drop their mm reference.
> kernel/fork.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 9796897560ab..35a3beff140b 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -792,6 +792,7 @@ void __mmdrop(struct mm_struct *mm)
> mmu_notifier_subscriptions_destroy(mm);
> check_mm(mm);
> put_user_ns(mm->user_ns);
> + mm_pasid_drop(mm);
> free_mm(mm);
> }
> EXPORT_SYMBOL_GPL(__mmdrop);
> @@ -1190,7 +1191,6 @@ static inline void __mmput(struct mm_struct *mm)
> }
> if (mm->binfmt)
> module_put(mm->binfmt->module);
> - mm_pasid_drop(mm);
> mmdrop(mm);
> }
>
Hi, Baolu,
On Fri, Apr 29, 2022 at 03:53:57PM +0800, Baolu Lu wrote:
> On 2022/4/28 16:39, Jean-Philippe Brucker wrote:
> > > The address space is what the OOM killer is after. That gets refcounted
> > > with mmget()/mmput()/mm->mm_users. The OOM killer is satiated by the
> > > page freeing done in __mmput()->exit_mmap().
> > >
> > > Also, all the VMAs should be gone after exit_mmap(). So, even if
> > > vma->vm_file was holding a reference to a device driver, that reference
> > > should be gone by the time __mmdrop() is actually freeing the PASID.
> >
> > I agree with all that. The concern was about tearing down the PASID in the
> > IOMMU and device from the release() MMU notifier, which would happen in
> > exit_mmap(). But doing the teardown at or before __mmdrop() is fine. And
> > since the IOMMU drivers need to hold mm->mm_count anyway between bind()
> > and unbind(), I think Fenghua's fix works.
>
> But I didn't find mmgrab()/mmdrop() get called in both arm and intel
> IOMMU drivers.
>
> $ git grep mmgrab drivers/iommu/
> [no output]
>
> Do we need to add these in a separated fix patch, or I missed anything
> here?
On both ARM and X86, sva_bind() calls mmu_notifier_register()->mmgrab() and
sva_unbind() calls mmu_notifier_unregister()/mmu_notifier_put()->mmdrop().
So mm->mm_count are already counted in existing ARM and X86 binding and
unbinding. The fix patch just frees the PASID in __mmdrop() after
no more mm->mm_count.
There is no need to add extra mmgrab() and mmdrop() pair.
Thanks.
-Fenghua
Hi, Jean and Baolu,
On Fri, Apr 29, 2022 at 03:34:36PM +0100, Jean-Philippe Brucker wrote:
> On Fri, Apr 29, 2022 at 06:51:17AM -0700, Fenghua Yu wrote:
> > Hi, Baolu,
> >
> > On Fri, Apr 29, 2022 at 03:53:57PM +0800, Baolu Lu wrote:
> > > On 2022/4/28 16:39, Jean-Philippe Brucker wrote:
> > > > > The address space is what the OOM killer is after. That gets refcounted
> > > > > with mmget()/mmput()/mm->mm_users. The OOM killer is satiated by the
> > > > > page freeing done in __mmput()->exit_mmap().
> > > > >
> > > > > Also, all the VMAs should be gone after exit_mmap(). So, even if
> > > > > vma->vm_file was holding a reference to a device driver, that reference
> > > > > should be gone by the time __mmdrop() is actually freeing the PASID.
> > > >
> > > > I agree with all that. The concern was about tearing down the PASID in the
> > > > IOMMU and device from the release() MMU notifier, which would happen in
> > > > exit_mmap(). But doing the teardown at or before __mmdrop() is fine. And
> > > > since the IOMMU drivers need to hold mm->mm_count anyway between bind()
> > > > and unbind(), I think Fenghua's fix works.
> > >
> > > But I didn't find mmgrab()/mmdrop() get called in both arm and intel
> > > IOMMU drivers.
> > >
> > > $ git grep mmgrab drivers/iommu/
> > > [no output]
> > >
> > > Do we need to add these in a separated fix patch, or I missed anything
> > > here?
> >
> > On both ARM and X86, sva_bind() calls mmu_notifier_register()->mmgrab() and
> > sva_unbind() calls mmu_notifier_unregister()/mmu_notifier_put()->mmdrop().
>
> Yes, although for Arm I realized the mmu_notifier grab wasn't sufficient
> so I sent a separate fix that should go in 5.18 as well
> https://lore.kernel.org/linux-iommu/[email protected]/
> The Arm driver still touches the arch mm context after mmu_notifier_put().
> I don't think X86 has that problem.
I think so too. On X86, the mm is not used after mmu_notifier_unregister().
In case of supervisor mode SVM (i.e. svm is bound to init_mm), the code
is right too because init_mm and its PASID cannot be dropped and
mmu_notifier_register()/mmu_notifier_unregister() are not called.
So I think no extra fix patch is needed on X86.
Thanks.
-Fenghua
On 4/28/22 08:28, Fenghua Yu wrote:
> Do you want me to change the changlog to add both this paragraph and the
> following paragraph?
Yes, as long as everyone agrees that it captures the issue well.
On 2022/4/28 16:39, Jean-Philippe Brucker wrote:
> On Tue, Apr 26, 2022 at 04:31:57PM -0700, Dave Hansen wrote:
>> On 4/26/22 09:48, Jean-Philippe Brucker wrote:
>>> On Tue, Apr 26, 2022 at 08:27:00AM -0700, Dave Hansen wrote:
>>>> On 4/25/22 09:40, Jean-Philippe Brucker wrote:
>>>>> The problem is that we'd have to request the device driver to stop DMA
>>>>> before we can destroy the context and free the PASID. We did consider
>>>>> doing this in the release() MMU notifier, but there were concerns about
>>>>> blocking mmput() for too long (for example
>>>>> https://lore.kernel.org/linux-iommu/[email protected]/
>>>>> though I think there was a more recent discussion). We also need to drain
>>>>> the PRI and fault queues to get rid of all references to that PASID.
>>>> Is the concern truly about blocking mmput() itself? Or, is it about
>>>> releasing the resources associated with the mm?
>>> The latter I think, this one was about releasing pages as fast as possible
>>> if the process is picked by the OOM killer.
>>
>> We're tying the PASID to the life of the mm itself, not the mm's address
>> space. That means the PASID should be tied to
>> mmgrab()/mmdrop()/mm->mm_count.
>>
>> The address space is what the OOM killer is after. That gets refcounted
>> with mmget()/mmput()/mm->mm_users. The OOM killer is satiated by the
>> page freeing done in __mmput()->exit_mmap().
>>
>> Also, all the VMAs should be gone after exit_mmap(). So, even if
>> vma->vm_file was holding a reference to a device driver, that reference
>> should be gone by the time __mmdrop() is actually freeing the PASID.
>
> I agree with all that. The concern was about tearing down the PASID in the
> IOMMU and device from the release() MMU notifier, which would happen in
> exit_mmap(). But doing the teardown at or before __mmdrop() is fine. And
> since the IOMMU drivers need to hold mm->mm_count anyway between bind()
> and unbind(), I think Fenghua's fix works.
But I didn't find mmgrab()/mmdrop() get called in both arm and intel
IOMMU drivers.
$ git grep mmgrab drivers/iommu/
[no output]
Do we need to add these in a separated fix patch, or I missed anything
here?
Best regards,
baolu
On 2022/4/30 06:19, Fenghua Yu wrote:
> Hi, Jean and Baolu,
>
> On Fri, Apr 29, 2022 at 03:34:36PM +0100, Jean-Philippe Brucker wrote:
>> On Fri, Apr 29, 2022 at 06:51:17AM -0700, Fenghua Yu wrote:
>>> Hi, Baolu,
>>>
>>> On Fri, Apr 29, 2022 at 03:53:57PM +0800, Baolu Lu wrote:
>>>> On 2022/4/28 16:39, Jean-Philippe Brucker wrote:
>>>>>> The address space is what the OOM killer is after. That gets refcounted
>>>>>> with mmget()/mmput()/mm->mm_users. The OOM killer is satiated by the
>>>>>> page freeing done in __mmput()->exit_mmap().
>>>>>>
>>>>>> Also, all the VMAs should be gone after exit_mmap(). So, even if
>>>>>> vma->vm_file was holding a reference to a device driver, that reference
>>>>>> should be gone by the time __mmdrop() is actually freeing the PASID.
>>>>>
>>>>> I agree with all that. The concern was about tearing down the PASID in the
>>>>> IOMMU and device from the release() MMU notifier, which would happen in
>>>>> exit_mmap(). But doing the teardown at or before __mmdrop() is fine. And
>>>>> since the IOMMU drivers need to hold mm->mm_count anyway between bind()
>>>>> and unbind(), I think Fenghua's fix works.
>>>>
>>>> But I didn't find mmgrab()/mmdrop() get called in both arm and intel
>>>> IOMMU drivers.
>>>>
>>>> $ git grep mmgrab drivers/iommu/
>>>> [no output]
>>>>
>>>> Do we need to add these in a separated fix patch, or I missed anything
>>>> here?
>>>
>>> On both ARM and X86, sva_bind() calls mmu_notifier_register()->mmgrab() and
>>> sva_unbind() calls mmu_notifier_unregister()/mmu_notifier_put()->mmdrop().
>>
>> Yes, although for Arm I realized the mmu_notifier grab wasn't sufficient
>> so I sent a separate fix that should go in 5.18 as well
>> https://lore.kernel.org/linux-iommu/[email protected]/
>> The Arm driver still touches the arch mm context after mmu_notifier_put().
>> I don't think X86 has that problem.
>
> I think so too. On X86, the mm is not used after mmu_notifier_unregister().
>
> In case of supervisor mode SVM (i.e. svm is bound to init_mm), the code
> is right too because init_mm and its PASID cannot be dropped and
> mmu_notifier_register()/mmu_notifier_unregister() are not called.
>
> So I think no extra fix patch is needed on X86.
Thanks, Fenghua and Jean. It's clear to me now.
Jean, another quick question about the iommu_sva_bind_device()
/**
* iommu_sva_bind_device() - Bind a process address space to a device
* @dev: the device
* @mm: the mm to bind, caller must hold a reference to it
* @drvdata: opaque data pointer to pass to bind callback
This interface requires the caller to take a reference to mm. Which
reference should it take, mm->mm_count or mm->mm_users? It's better to
make it explicit in this comment.
Best regards,
baolu
On Fri, Apr 29, 2022 at 06:51:17AM -0700, Fenghua Yu wrote:
> Hi, Baolu,
>
> On Fri, Apr 29, 2022 at 03:53:57PM +0800, Baolu Lu wrote:
> > On 2022/4/28 16:39, Jean-Philippe Brucker wrote:
> > > > The address space is what the OOM killer is after. That gets refcounted
> > > > with mmget()/mmput()/mm->mm_users. The OOM killer is satiated by the
> > > > page freeing done in __mmput()->exit_mmap().
> > > >
> > > > Also, all the VMAs should be gone after exit_mmap(). So, even if
> > > > vma->vm_file was holding a reference to a device driver, that reference
> > > > should be gone by the time __mmdrop() is actually freeing the PASID.
> > >
> > > I agree with all that. The concern was about tearing down the PASID in the
> > > IOMMU and device from the release() MMU notifier, which would happen in
> > > exit_mmap(). But doing the teardown at or before __mmdrop() is fine. And
> > > since the IOMMU drivers need to hold mm->mm_count anyway between bind()
> > > and unbind(), I think Fenghua's fix works.
> >
> > But I didn't find mmgrab()/mmdrop() get called in both arm and intel
> > IOMMU drivers.
> >
> > $ git grep mmgrab drivers/iommu/
> > [no output]
> >
> > Do we need to add these in a separated fix patch, or I missed anything
> > here?
>
> On both ARM and X86, sva_bind() calls mmu_notifier_register()->mmgrab() and
> sva_unbind() calls mmu_notifier_unregister()/mmu_notifier_put()->mmdrop().
Yes, although for Arm I realized the mmu_notifier grab wasn't sufficient
so I sent a separate fix that should go in 5.18 as well
https://lore.kernel.org/linux-iommu/[email protected]/
The Arm driver still touches the arch mm context after mmu_notifier_put().
I don't think X86 has that problem.
Thanks,
Jean
> So mm->mm_count are already counted in existing ARM and X86 binding and
> unbinding. The fix patch just frees the PASID in __mmdrop() after
> no more mm->mm_count.
>
> There is no need to add extra mmgrab() and mmdrop() pair.
>
> Thanks.
>
> -Fenghua
>
>
On 4/28/22 09:01, Jean-Philippe Brucker wrote:
>> But, this misses an important point: even after the address space is
>> gone, the PASID will still be programmed into a device. Device drivers
>> might, for instance, still need to flush operations that are outstanding
>> and need to use that PASID. They do this at ->release() time.
> It's not really clear which release() this is. For us it's file descriptor
> release() (not MMU notifier release(), which is how I initially understood
> this sentence)
OK, maybe that should be: "file->release() time" to make it more clear.
On Sat, Apr 30, 2022 at 03:33:17PM +0800, Baolu Lu wrote:
> Jean, another quick question about the iommu_sva_bind_device()
>
> /**
> * iommu_sva_bind_device() - Bind a process address space to a device
> * @dev: the device
> * @mm: the mm to bind, caller must hold a reference to it
> * @drvdata: opaque data pointer to pass to bind callback
>
> This interface requires the caller to take a reference to mm. Which
> reference should it take, mm->mm_count or mm->mm_users? It's better to
> make it explicit in this comment.
Agreed, it's mm_users as required by mmu_notifier_register()
Thanks,
Jean
On 2022/5/3 15:49, Jean-Philippe Brucker wrote:
> On Sat, Apr 30, 2022 at 03:33:17PM +0800, Baolu Lu wrote:
>> Jean, another quick question about the iommu_sva_bind_device()
>>
>> /**
>> * iommu_sva_bind_device() - Bind a process address space to a device
>> * @dev: the device
>> * @mm: the mm to bind, caller must hold a reference to it
>> * @drvdata: opaque data pointer to pass to bind callback
>>
>> This interface requires the caller to take a reference to mm. Which
>> reference should it take, mm->mm_count or mm->mm_users? It's better to
>> make it explicit in this comment.
>
> Agreed, it's mm_users as required by mmu_notifier_register()
Thanks! I will add this in my refactoring patch.
Best regards,
baolu