2020-07-06 00:30:55

by Lu Baolu

[permalink] [raw]
Subject: [PATCH v2 3/4] iommu/vt-d: Report page request faults for guest SVA

A pasid might be bound to a page table from a VM guest via the iommu
ops.sva_bind_gpasid. In this case, when a DMA page fault is detected
on the physical IOMMU, we need to inject the page fault request into
the guest. After the guest completes handling the page fault, a page
response need to be sent back via the iommu ops.page_response().

This adds support to report a page request fault. Any external module
which is interested in handling this fault should regiester a notifier
callback.

Co-developed-by: Jacob Pan <[email protected]>
Signed-off-by: Jacob Pan <[email protected]>
Co-developed-by: Liu Yi L <[email protected]>
Signed-off-by: Liu Yi L <[email protected]>
Signed-off-by: Lu Baolu <[email protected]>
---
drivers/iommu/intel/svm.c | 99 ++++++++++++++++++++++++++++++++-------
1 file changed, 81 insertions(+), 18 deletions(-)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index c23167877b2b..08c58c2b1a06 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -815,6 +815,57 @@ static void intel_svm_drain_prq(struct device *dev, int pasid)
}
}

+static int prq_to_iommu_prot(struct page_req_dsc *req)
+{
+ int prot = 0;
+
+ if (req->rd_req)
+ prot |= IOMMU_FAULT_PERM_READ;
+ if (req->wr_req)
+ prot |= IOMMU_FAULT_PERM_WRITE;
+ if (req->exe_req)
+ prot |= IOMMU_FAULT_PERM_EXEC;
+ if (req->pm_req)
+ prot |= IOMMU_FAULT_PERM_PRIV;
+
+ return prot;
+}
+
+static int
+intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
+{
+ struct iommu_fault_event event;
+ u8 bus, devfn;
+
+ memset(&event, 0, sizeof(struct iommu_fault_event));
+ bus = PCI_BUS_NUM(desc->rid);
+ devfn = desc->rid & 0xff;
+
+ /* Fill in event data for device specific processing */
+ event.fault.type = IOMMU_FAULT_PAGE_REQ;
+ event.fault.prm.addr = desc->addr;
+ event.fault.prm.pasid = desc->pasid;
+ event.fault.prm.grpid = desc->prg_index;
+ event.fault.prm.perm = prq_to_iommu_prot(desc);
+
+ /*
+ * Set last page in group bit if private data is present,
+ * page response is required as it does for LPIG.
+ */
+ if (desc->lpig)
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
+ if (desc->pasid_present)
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+ if (desc->priv_data_present) {
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
+ memcpy(event.fault.prm.private_data, desc->priv_data,
+ sizeof(desc->priv_data));
+ }
+
+ return iommu_report_device_fault(dev, &event);
+}
+
static irqreturn_t prq_event_thread(int irq, void *d)
{
struct intel_iommu *iommu = d;
@@ -828,7 +879,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
while (head != tail) {
- struct intel_svm_dev *sdev;
+ struct intel_svm_dev *sdev = NULL;
struct vm_area_struct *vma;
struct page_req_dsc *req;
struct qi_desc resp;
@@ -864,6 +915,20 @@ static irqreturn_t prq_event_thread(int irq, void *d)
}
}

+ if (!sdev || sdev->sid != req->rid) {
+ struct intel_svm_dev *t;
+
+ sdev = NULL;
+ rcu_read_lock();
+ list_for_each_entry_rcu(t, &svm->devs, list) {
+ if (t->sid == req->rid) {
+ sdev = t;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ }
+
result = QI_RESP_INVALID;
/* Since we're using init_mm.pgd directly, we should never take
* any faults on kernel addresses. */
@@ -874,6 +939,17 @@ static irqreturn_t prq_event_thread(int irq, void *d)
if (!is_canonical_address(address))
goto bad_req;

+ /*
+ * If prq is to be handled outside iommu driver via receiver of
+ * the fault notifiers, we skip the page response here.
+ */
+ if (svm->flags & SVM_FLAG_GUEST_MODE) {
+ if (sdev && !intel_svm_prq_report(sdev->dev, req))
+ goto prq_advance;
+ else
+ goto bad_req;
+ }
+
/* If the mm is already defunct, don't handle faults. */
if (!mmget_not_zero(svm->mm))
goto bad_req;
@@ -892,24 +968,10 @@ static irqreturn_t prq_event_thread(int irq, void *d)
goto invalid;

result = QI_RESP_SUCCESS;
- invalid:
+invalid:
mmap_read_unlock(svm->mm);
mmput(svm->mm);
- bad_req:
- /* Accounting for major/minor faults? */
- rcu_read_lock();
- list_for_each_entry_rcu(sdev, &svm->devs, list) {
- if (sdev->sid == req->rid)
- break;
- }
- /* Other devices can go away, but the drivers are not permitted
- * to unbind while any page faults might be in flight. So it's
- * OK to drop the 'lock' here now we have it. */
- rcu_read_unlock();
-
- if (WARN_ON(&sdev->list == &svm->devs))
- sdev = NULL;
-
+bad_req:
if (sdev && sdev->ops && sdev->ops->fault_cb) {
int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
(req->exe_req << 1) | (req->pm_req);
@@ -920,7 +982,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
and these can be NULL. Do not use them below this point! */
sdev = NULL;
svm = NULL;
- no_pasid:
+no_pasid:
if (req->lpig || req->priv_data_present) {
/*
* Per VT-d spec. v3.0 ch7.7, system software must
@@ -945,6 +1007,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
resp.qw3 = 0;
qi_submit_sync(iommu, &resp, 1, 0);
}
+prq_advance:
head = (head + sizeof(*req)) & PRQ_RING_MASK;
}

--
2.17.1


2020-07-06 01:31:15

by Tian, Kevin

[permalink] [raw]
Subject: RE: [PATCH v2 3/4] iommu/vt-d: Report page request faults for guest SVA

> From: Lu Baolu <[email protected]>
> Sent: Monday, July 6, 2020 8:26 AM
>
> A pasid might be bound to a page table from a VM guest via the iommu
> ops.sva_bind_gpasid. In this case, when a DMA page fault is detected
> on the physical IOMMU, we need to inject the page fault request into
> the guest. After the guest completes handling the page fault, a page
> response need to be sent back via the iommu ops.page_response().
>
> This adds support to report a page request fault. Any external module
> which is interested in handling this fault should regiester a notifier
> callback.

be specific on which notifier to be registered...

>
> Co-developed-by: Jacob Pan <[email protected]>
> Signed-off-by: Jacob Pan <[email protected]>
> Co-developed-by: Liu Yi L <[email protected]>
> Signed-off-by: Liu Yi L <[email protected]>
> Signed-off-by: Lu Baolu <[email protected]>
> ---
> drivers/iommu/intel/svm.c | 99 ++++++++++++++++++++++++++++++++-------
> 1 file changed, 81 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
> index c23167877b2b..08c58c2b1a06 100644
> --- a/drivers/iommu/intel/svm.c
> +++ b/drivers/iommu/intel/svm.c
> @@ -815,6 +815,57 @@ static void intel_svm_drain_prq(struct device *dev,
> int pasid)
> }
> }
>
> +static int prq_to_iommu_prot(struct page_req_dsc *req)
> +{
> + int prot = 0;
> +
> + if (req->rd_req)
> + prot |= IOMMU_FAULT_PERM_READ;
> + if (req->wr_req)
> + prot |= IOMMU_FAULT_PERM_WRITE;
> + if (req->exe_req)
> + prot |= IOMMU_FAULT_PERM_EXEC;
> + if (req->pm_req)
> + prot |= IOMMU_FAULT_PERM_PRIV;
> +
> + return prot;
> +}
> +
> +static int
> +intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
> +{
> + struct iommu_fault_event event;
> + u8 bus, devfn;
> +
> + memset(&event, 0, sizeof(struct iommu_fault_event));
> + bus = PCI_BUS_NUM(desc->rid);
> + devfn = desc->rid & 0xff;

not required.

> +
> + /* Fill in event data for device specific processing */
> + event.fault.type = IOMMU_FAULT_PAGE_REQ;
> + event.fault.prm.addr = desc->addr;
> + event.fault.prm.pasid = desc->pasid;
> + event.fault.prm.grpid = desc->prg_index;
> + event.fault.prm.perm = prq_to_iommu_prot(desc);
> +
> + /*
> + * Set last page in group bit if private data is present,
> + * page response is required as it does for LPIG.
> + */

move to priv_data_present check?

> + if (desc->lpig)
> + event.fault.prm.flags |=
> IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
> + if (desc->pasid_present)
> + event.fault.prm.flags |=
> IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
> + if (desc->priv_data_present) {
> + event.fault.prm.flags |=
> IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
> + event.fault.prm.flags |=
> IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
> + memcpy(event.fault.prm.private_data, desc->priv_data,
> + sizeof(desc->priv_data));
> + }
> +
> + return iommu_report_device_fault(dev, &event);
> +}
> +
> static irqreturn_t prq_event_thread(int irq, void *d)
> {
> struct intel_iommu *iommu = d;
> @@ -828,7 +879,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
> tail = dmar_readq(iommu->reg + DMAR_PQT_REG) &
> PRQ_RING_MASK;
> head = dmar_readq(iommu->reg + DMAR_PQH_REG) &
> PRQ_RING_MASK;
> while (head != tail) {
> - struct intel_svm_dev *sdev;
> + struct intel_svm_dev *sdev = NULL;

move to outside of the loop, otherwise later check always hit "if (!sdev)"

> struct vm_area_struct *vma;
> struct page_req_dsc *req;
> struct qi_desc resp;
> @@ -864,6 +915,20 @@ static irqreturn_t prq_event_thread(int irq, void *d)
> }
> }
>
> + if (!sdev || sdev->sid != req->rid) {
> + struct intel_svm_dev *t;
> +
> + sdev = NULL;
> + rcu_read_lock();
> + list_for_each_entry_rcu(t, &svm->devs, list) {
> + if (t->sid == req->rid) {
> + sdev = t;
> + break;
> + }
> + }
> + rcu_read_unlock();
> + }
> +
> result = QI_RESP_INVALID;
> /* Since we're using init_mm.pgd directly, we should never
> take
> * any faults on kernel addresses. */
> @@ -874,6 +939,17 @@ static irqreturn_t prq_event_thread(int irq, void *d)
> if (!is_canonical_address(address))
> goto bad_req;
>
> + /*
> + * If prq is to be handled outside iommu driver via receiver of
> + * the fault notifiers, we skip the page response here.
> + */
> + if (svm->flags & SVM_FLAG_GUEST_MODE) {
> + if (sdev && !intel_svm_prq_report(sdev->dev, req))
> + goto prq_advance;
> + else
> + goto bad_req;
> + }
> +
> /* If the mm is already defunct, don't handle faults. */
> if (!mmget_not_zero(svm->mm))
> goto bad_req;
> @@ -892,24 +968,10 @@ static irqreturn_t prq_event_thread(int irq, void *d)
> goto invalid;
>
> result = QI_RESP_SUCCESS;
> - invalid:
> +invalid:
> mmap_read_unlock(svm->mm);
> mmput(svm->mm);
> - bad_req:
> - /* Accounting for major/minor faults? */
> - rcu_read_lock();
> - list_for_each_entry_rcu(sdev, &svm->devs, list) {
> - if (sdev->sid == req->rid)
> - break;
> - }
> - /* Other devices can go away, but the drivers are not
> permitted
> - * to unbind while any page faults might be in flight. So it's
> - * OK to drop the 'lock' here now we have it. */

should we keep and move this comment to earlier sdev lookup? and
regarding to guest unbind, ae we preventing the fault owner (outside
of iommu driver) to unbind against in-flight fault request?

> - rcu_read_unlock();
> -
> - if (WARN_ON(&sdev->list == &svm->devs))
> - sdev = NULL;

similarly should we keep the WARN_ON check here?

> -
> +bad_req:
> if (sdev && sdev->ops && sdev->ops->fault_cb) {
> int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
> (req->exe_req << 1) | (req->pm_req);
> @@ -920,7 +982,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
> and these can be NULL. Do not use them below this point!
> */
> sdev = NULL;
> svm = NULL;
> - no_pasid:
> +no_pasid:
> if (req->lpig || req->priv_data_present) {
> /*
> * Per VT-d spec. v3.0 ch7.7, system software must
> @@ -945,6 +1007,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
> resp.qw3 = 0;
> qi_submit_sync(iommu, &resp, 1, 0);
> }
> +prq_advance:
> head = (head + sizeof(*req)) & PRQ_RING_MASK;
> }
>
> --
> 2.17.1

Thanks
Kevin

2020-07-06 01:38:22

by Tian, Kevin

[permalink] [raw]
Subject: RE: [PATCH v2 3/4] iommu/vt-d: Report page request faults for guest SVA

> From: Tian, Kevin
> Sent: Monday, July 6, 2020 9:30 AM
>
> > From: Lu Baolu <[email protected]>
> > Sent: Monday, July 6, 2020 8:26 AM
> >
> > A pasid might be bound to a page table from a VM guest via the iommu
> > ops.sva_bind_gpasid. In this case, when a DMA page fault is detected
> > on the physical IOMMU, we need to inject the page fault request into
> > the guest. After the guest completes handling the page fault, a page
> > response need to be sent back via the iommu ops.page_response().
> >
> > This adds support to report a page request fault. Any external module
> > which is interested in handling this fault should regiester a notifier
> > callback.
>
> be specific on which notifier to be registered...
>
> >
> > Co-developed-by: Jacob Pan <[email protected]>
> > Signed-off-by: Jacob Pan <[email protected]>
> > Co-developed-by: Liu Yi L <[email protected]>
> > Signed-off-by: Liu Yi L <[email protected]>
> > Signed-off-by: Lu Baolu <[email protected]>
> > ---
> > drivers/iommu/intel/svm.c | 99 ++++++++++++++++++++++++++++++++-----
> --
> > 1 file changed, 81 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
> > index c23167877b2b..08c58c2b1a06 100644
> > --- a/drivers/iommu/intel/svm.c
> > +++ b/drivers/iommu/intel/svm.c
> > @@ -815,6 +815,57 @@ static void intel_svm_drain_prq(struct device *dev,
> > int pasid)
> > }
> > }
> >
> > +static int prq_to_iommu_prot(struct page_req_dsc *req)
> > +{
> > + int prot = 0;
> > +
> > + if (req->rd_req)
> > + prot |= IOMMU_FAULT_PERM_READ;
> > + if (req->wr_req)
> > + prot |= IOMMU_FAULT_PERM_WRITE;
> > + if (req->exe_req)
> > + prot |= IOMMU_FAULT_PERM_EXEC;
> > + if (req->pm_req)
> > + prot |= IOMMU_FAULT_PERM_PRIV;
> > +
> > + return prot;
> > +}
> > +
> > +static int
> > +intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
> > +{
> > + struct iommu_fault_event event;
> > + u8 bus, devfn;
> > +
> > + memset(&event, 0, sizeof(struct iommu_fault_event));
> > + bus = PCI_BUS_NUM(desc->rid);
> > + devfn = desc->rid & 0xff;
>
> not required.
>
> > +
> > + /* Fill in event data for device specific processing */
> > + event.fault.type = IOMMU_FAULT_PAGE_REQ;
> > + event.fault.prm.addr = desc->addr;
> > + event.fault.prm.pasid = desc->pasid;
> > + event.fault.prm.grpid = desc->prg_index;
> > + event.fault.prm.perm = prq_to_iommu_prot(desc);
> > +
> > + /*
> > + * Set last page in group bit if private data is present,
> > + * page response is required as it does for LPIG.
> > + */
>
> move to priv_data_present check?
>
> > + if (desc->lpig)
> > + event.fault.prm.flags |=
> > IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
> > + if (desc->pasid_present)
> > + event.fault.prm.flags |=
> > IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
> > + if (desc->priv_data_present) {
> > + event.fault.prm.flags |=
> > IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;

btw earlier comment is more about the behavior of the fault
handler (e.g. the guest), but not about why we need convert
to last_page prm flag. Let's make it clear that doing so is
because iommu_report_device_fault doesn't understand this
vt-d specific requirement thus we set last_page as a workaround.

Thanks
Kevin

> > + event.fault.prm.flags |=
> > IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
> > + memcpy(event.fault.prm.private_data, desc->priv_data,
> > + sizeof(desc->priv_data));
> > + }
> > +
> > + return iommu_report_device_fault(dev, &event);
> > +}
> > +
> > static irqreturn_t prq_event_thread(int irq, void *d)
> > {
> > struct intel_iommu *iommu = d;
> > @@ -828,7 +879,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
> > tail = dmar_readq(iommu->reg + DMAR_PQT_REG) &
> > PRQ_RING_MASK;
> > head = dmar_readq(iommu->reg + DMAR_PQH_REG) &
> > PRQ_RING_MASK;
> > while (head != tail) {
> > - struct intel_svm_dev *sdev;
> > + struct intel_svm_dev *sdev = NULL;
>
> move to outside of the loop, otherwise later check always hit "if (!sdev)"
>
> > struct vm_area_struct *vma;
> > struct page_req_dsc *req;
> > struct qi_desc resp;
> > @@ -864,6 +915,20 @@ static irqreturn_t prq_event_thread(int irq, void
> *d)
> > }
> > }
> >
> > + if (!sdev || sdev->sid != req->rid) {
> > + struct intel_svm_dev *t;
> > +
> > + sdev = NULL;
> > + rcu_read_lock();
> > + list_for_each_entry_rcu(t, &svm->devs, list) {
> > + if (t->sid == req->rid) {
> > + sdev = t;
> > + break;
> > + }
> > + }
> > + rcu_read_unlock();
> > + }
> > +
> > result = QI_RESP_INVALID;
> > /* Since we're using init_mm.pgd directly, we should never
> > take
> > * any faults on kernel addresses. */
> > @@ -874,6 +939,17 @@ static irqreturn_t prq_event_thread(int irq, void
> *d)
> > if (!is_canonical_address(address))
> > goto bad_req;
> >
> > + /*
> > + * If prq is to be handled outside iommu driver via receiver of
> > + * the fault notifiers, we skip the page response here.
> > + */
> > + if (svm->flags & SVM_FLAG_GUEST_MODE) {
> > + if (sdev && !intel_svm_prq_report(sdev->dev, req))
> > + goto prq_advance;
> > + else
> > + goto bad_req;
> > + }
> > +
> > /* If the mm is already defunct, don't handle faults. */
> > if (!mmget_not_zero(svm->mm))
> > goto bad_req;
> > @@ -892,24 +968,10 @@ static irqreturn_t prq_event_thread(int irq, void
> *d)
> > goto invalid;
> >
> > result = QI_RESP_SUCCESS;
> > - invalid:
> > +invalid:
> > mmap_read_unlock(svm->mm);
> > mmput(svm->mm);
> > - bad_req:
> > - /* Accounting for major/minor faults? */
> > - rcu_read_lock();
> > - list_for_each_entry_rcu(sdev, &svm->devs, list) {
> > - if (sdev->sid == req->rid)
> > - break;
> > - }
> > - /* Other devices can go away, but the drivers are not
> > permitted
> > - * to unbind while any page faults might be in flight. So it's
> > - * OK to drop the 'lock' here now we have it. */
>
> should we keep and move this comment to earlier sdev lookup? and
> regarding to guest unbind, ae we preventing the fault owner (outside
> of iommu driver) to unbind against in-flight fault request?
>
> > - rcu_read_unlock();
> > -
> > - if (WARN_ON(&sdev->list == &svm->devs))
> > - sdev = NULL;
>
> similarly should we keep the WARN_ON check here?
>
> > -
> > +bad_req:
> > if (sdev && sdev->ops && sdev->ops->fault_cb) {
> > int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
> > (req->exe_req << 1) | (req->pm_req);
> > @@ -920,7 +982,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
> > and these can be NULL. Do not use them below this point!
> > */
> > sdev = NULL;
> > svm = NULL;
> > - no_pasid:
> > +no_pasid:
> > if (req->lpig || req->priv_data_present) {
> > /*
> > * Per VT-d spec. v3.0 ch7.7, system software must
> > @@ -945,6 +1007,7 @@ static irqreturn_t prq_event_thread(int irq, void
> *d)
> > resp.qw3 = 0;
> > qi_submit_sync(iommu, &resp, 1, 0);
> > }
> > +prq_advance:
> > head = (head + sizeof(*req)) & PRQ_RING_MASK;
> > }
> >
> > --
> > 2.17.1
>
> Thanks
> Kevin

2020-07-06 07:40:18

by Lu Baolu

[permalink] [raw]
Subject: Re: [PATCH v2 3/4] iommu/vt-d: Report page request faults for guest SVA

Hi Kevin,

On 2020/7/6 9:29, Tian, Kevin wrote:
>> From: Lu Baolu <[email protected]>
>> Sent: Monday, July 6, 2020 8:26 AM
>>
>> A pasid might be bound to a page table from a VM guest via the iommu
>> ops.sva_bind_gpasid. In this case, when a DMA page fault is detected
>> on the physical IOMMU, we need to inject the page fault request into
>> the guest. After the guest completes handling the page fault, a page
>> response need to be sent back via the iommu ops.page_response().
>>
>> This adds support to report a page request fault. Any external module
>> which is interested in handling this fault should regiester a notifier
>> callback.
>
> be specific on which notifier to be registered...

Sure.

>
>>
>> Co-developed-by: Jacob Pan <[email protected]>
>> Signed-off-by: Jacob Pan <[email protected]>
>> Co-developed-by: Liu Yi L <[email protected]>
>> Signed-off-by: Liu Yi L <[email protected]>
>> Signed-off-by: Lu Baolu <[email protected]>
>> ---
>> drivers/iommu/intel/svm.c | 99 ++++++++++++++++++++++++++++++++-------
>> 1 file changed, 81 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
>> index c23167877b2b..08c58c2b1a06 100644
>> --- a/drivers/iommu/intel/svm.c
>> +++ b/drivers/iommu/intel/svm.c
>> @@ -815,6 +815,57 @@ static void intel_svm_drain_prq(struct device *dev,
>> int pasid)
>> }
>> }
>>
>> +static int prq_to_iommu_prot(struct page_req_dsc *req)
>> +{
>> + int prot = 0;
>> +
>> + if (req->rd_req)
>> + prot |= IOMMU_FAULT_PERM_READ;
>> + if (req->wr_req)
>> + prot |= IOMMU_FAULT_PERM_WRITE;
>> + if (req->exe_req)
>> + prot |= IOMMU_FAULT_PERM_EXEC;
>> + if (req->pm_req)
>> + prot |= IOMMU_FAULT_PERM_PRIV;
>> +
>> + return prot;
>> +}
>> +
>> +static int
>> +intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
>> +{
>> + struct iommu_fault_event event;
>> + u8 bus, devfn;
>> +
>> + memset(&event, 0, sizeof(struct iommu_fault_event));
>> + bus = PCI_BUS_NUM(desc->rid);
>> + devfn = desc->rid & 0xff;
>
> not required.

Yes. Will remove them.

>
>> +
>> + /* Fill in event data for device specific processing */
>> + event.fault.type = IOMMU_FAULT_PAGE_REQ;
>> + event.fault.prm.addr = desc->addr;
>> + event.fault.prm.pasid = desc->pasid;
>> + event.fault.prm.grpid = desc->prg_index;
>> + event.fault.prm.perm = prq_to_iommu_prot(desc);
>> +
>> + /*
>> + * Set last page in group bit if private data is present,
>> + * page response is required as it does for LPIG.
>> + */
>
> move to priv_data_present check?

Yes.

>
>> + if (desc->lpig)
>> + event.fault.prm.flags |=
>> IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
>> + if (desc->pasid_present)
>> + event.fault.prm.flags |=
>> IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
>> + if (desc->priv_data_present) {
>> + event.fault.prm.flags |=
>> IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
>> + event.fault.prm.flags |=
>> IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
>> + memcpy(event.fault.prm.private_data, desc->priv_data,
>> + sizeof(desc->priv_data));
>> + }
>> +
>> + return iommu_report_device_fault(dev, &event);
>> +}
>> +
>> static irqreturn_t prq_event_thread(int irq, void *d)
>> {
>> struct intel_iommu *iommu = d;
>> @@ -828,7 +879,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
>> tail = dmar_readq(iommu->reg + DMAR_PQT_REG) &
>> PRQ_RING_MASK;
>> head = dmar_readq(iommu->reg + DMAR_PQH_REG) &
>> PRQ_RING_MASK;
>> while (head != tail) {
>> - struct intel_svm_dev *sdev;
>> + struct intel_svm_dev *sdev = NULL;
>
> move to outside of the loop, otherwise later check always hit "if (!sdev)"

Yes, good catch!

>
>> struct vm_area_struct *vma;
>> struct page_req_dsc *req;
>> struct qi_desc resp;
>> @@ -864,6 +915,20 @@ static irqreturn_t prq_event_thread(int irq, void *d)
>> }
>> }
>>
>> + if (!sdev || sdev->sid != req->rid) {
>> + struct intel_svm_dev *t;
>> +
>> + sdev = NULL;
>> + rcu_read_lock();
>> + list_for_each_entry_rcu(t, &svm->devs, list) {
>> + if (t->sid == req->rid) {
>> + sdev = t;
>> + break;
>> + }
>> + }
>> + rcu_read_unlock();
>> + }
>> +
>> result = QI_RESP_INVALID;
>> /* Since we're using init_mm.pgd directly, we should never
>> take
>> * any faults on kernel addresses. */
>> @@ -874,6 +939,17 @@ static irqreturn_t prq_event_thread(int irq, void *d)
>> if (!is_canonical_address(address))
>> goto bad_req;
>>
>> + /*
>> + * If prq is to be handled outside iommu driver via receiver of
>> + * the fault notifiers, we skip the page response here.
>> + */
>> + if (svm->flags & SVM_FLAG_GUEST_MODE) {
>> + if (sdev && !intel_svm_prq_report(sdev->dev, req))
>> + goto prq_advance;
>> + else
>> + goto bad_req;
>> + }
>> +
>> /* If the mm is already defunct, don't handle faults. */
>> if (!mmget_not_zero(svm->mm))
>> goto bad_req;
>> @@ -892,24 +968,10 @@ static irqreturn_t prq_event_thread(int irq, void *d)
>> goto invalid;
>>
>> result = QI_RESP_SUCCESS;
>> - invalid:
>> +invalid:
>> mmap_read_unlock(svm->mm);
>> mmput(svm->mm);
>> - bad_req:
>> - /* Accounting for major/minor faults? */
>> - rcu_read_lock();
>> - list_for_each_entry_rcu(sdev, &svm->devs, list) {
>> - if (sdev->sid == req->rid)
>> - break;
>> - }
>> - /* Other devices can go away, but the drivers are not
>> permitted
>> - * to unbind while any page faults might be in flight. So it's
>> - * OK to drop the 'lock' here now we have it. */
>
> should we keep and move this comment to earlier sdev lookup? and

I thought this comment explained why rcu_read_unlock() before the next
checking. In the new lookup code, we don't need to check, hence I
removed the comments.

> regarding to guest unbind, ae we preventing the fault owner (outside
> of iommu driver) to unbind against in-flight fault request?

Yes. We always wait until all prq with the same pasid completes in
gpasid_unbind().

>
>> - rcu_read_unlock();
>> -
>> - if (WARN_ON(&sdev->list == &svm->devs))
>> - sdev = NULL;
>
> similarly should we keep the WARN_ON check here?

Yes, agreed. We can keep a WARN_ON() here.

>
>> -
>> +bad_req:
>> if (sdev && sdev->ops && sdev->ops->fault_cb) {
>> int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
>> (req->exe_req << 1) | (req->pm_req);
>> @@ -920,7 +982,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
>> and these can be NULL. Do not use them below this point!
>> */
>> sdev = NULL;
>> svm = NULL;
>> - no_pasid:
>> +no_pasid:
>> if (req->lpig || req->priv_data_present) {
>> /*
>> * Per VT-d spec. v3.0 ch7.7, system software must
>> @@ -945,6 +1007,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
>> resp.qw3 = 0;
>> qi_submit_sync(iommu, &resp, 1, 0);
>> }
>> +prq_advance:
>> head = (head + sizeof(*req)) & PRQ_RING_MASK;
>> }
>>
>> --
>> 2.17.1
>
> Thanks
> Kevin
>

Best regards,
baolu

2020-07-06 07:50:44

by Lu Baolu

[permalink] [raw]
Subject: Re: [PATCH v2 3/4] iommu/vt-d: Report page request faults for guest SVA

On 2020/7/6 9:36, Tian, Kevin wrote:
>> From: Tian, Kevin
>> Sent: Monday, July 6, 2020 9:30 AM
>>
>>> From: Lu Baolu <[email protected]>
>>> Sent: Monday, July 6, 2020 8:26 AM
>>>
>>> A pasid might be bound to a page table from a VM guest via the iommu
>>> ops.sva_bind_gpasid. In this case, when a DMA page fault is detected
>>> on the physical IOMMU, we need to inject the page fault request into
>>> the guest. After the guest completes handling the page fault, a page
>>> response need to be sent back via the iommu ops.page_response().
>>>
>>> This adds support to report a page request fault. Any external module
>>> which is interested in handling this fault should regiester a notifier
>>> callback.
>>
>> be specific on which notifier to be registered...
>>
>>>
>>> Co-developed-by: Jacob Pan <[email protected]>
>>> Signed-off-by: Jacob Pan <[email protected]>
>>> Co-developed-by: Liu Yi L <[email protected]>
>>> Signed-off-by: Liu Yi L <[email protected]>
>>> Signed-off-by: Lu Baolu <[email protected]>
>>> ---
>>> drivers/iommu/intel/svm.c | 99 ++++++++++++++++++++++++++++++++-----
>> --
>>> 1 file changed, 81 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
>>> index c23167877b2b..08c58c2b1a06 100644
>>> --- a/drivers/iommu/intel/svm.c
>>> +++ b/drivers/iommu/intel/svm.c
>>> @@ -815,6 +815,57 @@ static void intel_svm_drain_prq(struct device *dev,
>>> int pasid)
>>> }
>>> }
>>>
>>> +static int prq_to_iommu_prot(struct page_req_dsc *req)
>>> +{
>>> + int prot = 0;
>>> +
>>> + if (req->rd_req)
>>> + prot |= IOMMU_FAULT_PERM_READ;
>>> + if (req->wr_req)
>>> + prot |= IOMMU_FAULT_PERM_WRITE;
>>> + if (req->exe_req)
>>> + prot |= IOMMU_FAULT_PERM_EXEC;
>>> + if (req->pm_req)
>>> + prot |= IOMMU_FAULT_PERM_PRIV;
>>> +
>>> + return prot;
>>> +}
>>> +
>>> +static int
>>> +intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
>>> +{
>>> + struct iommu_fault_event event;
>>> + u8 bus, devfn;
>>> +
>>> + memset(&event, 0, sizeof(struct iommu_fault_event));
>>> + bus = PCI_BUS_NUM(desc->rid);
>>> + devfn = desc->rid & 0xff;
>>
>> not required.
>>
>>> +
>>> + /* Fill in event data for device specific processing */
>>> + event.fault.type = IOMMU_FAULT_PAGE_REQ;
>>> + event.fault.prm.addr = desc->addr;
>>> + event.fault.prm.pasid = desc->pasid;
>>> + event.fault.prm.grpid = desc->prg_index;
>>> + event.fault.prm.perm = prq_to_iommu_prot(desc);
>>> +
>>> + /*
>>> + * Set last page in group bit if private data is present,
>>> + * page response is required as it does for LPIG.
>>> + */
>>
>> move to priv_data_present check?
>>
>>> + if (desc->lpig)
>>> + event.fault.prm.flags |=
>>> IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
>>> + if (desc->pasid_present)
>>> + event.fault.prm.flags |=
>>> IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
>>> + if (desc->priv_data_present) {
>>> + event.fault.prm.flags |=
>>> IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
>
> btw earlier comment is more about the behavior of the fault
> handler (e.g. the guest), but not about why we need convert
> to last_page prm flag. Let's make it clear that doing so is
> because iommu_report_device_fault doesn't understand this
> vt-d specific requirement thus we set last_page as a workaround.

Yes. I will add this in the comment.

Best regards,
baolu

>
> Thanks
> Kevin
>
>>> + event.fault.prm.flags |=
>>> IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
>>> + memcpy(event.fault.prm.private_data, desc->priv_data,
>>> + sizeof(desc->priv_data));
>>> + }
>>> +
>>> + return iommu_report_device_fault(dev, &event);
>>> +}
>>> +
>>> static irqreturn_t prq_event_thread(int irq, void *d)
>>> {
>>> struct intel_iommu *iommu = d;
>>> @@ -828,7 +879,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
>>> tail = dmar_readq(iommu->reg + DMAR_PQT_REG) &
>>> PRQ_RING_MASK;
>>> head = dmar_readq(iommu->reg + DMAR_PQH_REG) &
>>> PRQ_RING_MASK;
>>> while (head != tail) {
>>> - struct intel_svm_dev *sdev;
>>> + struct intel_svm_dev *sdev = NULL;
>>
>> move to outside of the loop, otherwise later check always hit "if (!sdev)"
>>
>>> struct vm_area_struct *vma;
>>> struct page_req_dsc *req;
>>> struct qi_desc resp;
>>> @@ -864,6 +915,20 @@ static irqreturn_t prq_event_thread(int irq, void
>> *d)
>>> }
>>> }
>>>
>>> + if (!sdev || sdev->sid != req->rid) {
>>> + struct intel_svm_dev *t;
>>> +
>>> + sdev = NULL;
>>> + rcu_read_lock();
>>> + list_for_each_entry_rcu(t, &svm->devs, list) {
>>> + if (t->sid == req->rid) {
>>> + sdev = t;
>>> + break;
>>> + }
>>> + }
>>> + rcu_read_unlock();
>>> + }
>>> +
>>> result = QI_RESP_INVALID;
>>> /* Since we're using init_mm.pgd directly, we should never
>>> take
>>> * any faults on kernel addresses. */
>>> @@ -874,6 +939,17 @@ static irqreturn_t prq_event_thread(int irq, void
>> *d)
>>> if (!is_canonical_address(address))
>>> goto bad_req;
>>>
>>> + /*
>>> + * If prq is to be handled outside iommu driver via receiver of
>>> + * the fault notifiers, we skip the page response here.
>>> + */
>>> + if (svm->flags & SVM_FLAG_GUEST_MODE) {
>>> + if (sdev && !intel_svm_prq_report(sdev->dev, req))
>>> + goto prq_advance;
>>> + else
>>> + goto bad_req;
>>> + }
>>> +
>>> /* If the mm is already defunct, don't handle faults. */
>>> if (!mmget_not_zero(svm->mm))
>>> goto bad_req;
>>> @@ -892,24 +968,10 @@ static irqreturn_t prq_event_thread(int irq, void
>> *d)
>>> goto invalid;
>>>
>>> result = QI_RESP_SUCCESS;
>>> - invalid:
>>> +invalid:
>>> mmap_read_unlock(svm->mm);
>>> mmput(svm->mm);
>>> - bad_req:
>>> - /* Accounting for major/minor faults? */
>>> - rcu_read_lock();
>>> - list_for_each_entry_rcu(sdev, &svm->devs, list) {
>>> - if (sdev->sid == req->rid)
>>> - break;
>>> - }
>>> - /* Other devices can go away, but the drivers are not
>>> permitted
>>> - * to unbind while any page faults might be in flight. So it's
>>> - * OK to drop the 'lock' here now we have it. */
>>
>> should we keep and move this comment to earlier sdev lookup? and
>> regarding to guest unbind, ae we preventing the fault owner (outside
>> of iommu driver) to unbind against in-flight fault request?
>>
>>> - rcu_read_unlock();
>>> -
>>> - if (WARN_ON(&sdev->list == &svm->devs))
>>> - sdev = NULL;
>>
>> similarly should we keep the WARN_ON check here?
>>
>>> -
>>> +bad_req:
>>> if (sdev && sdev->ops && sdev->ops->fault_cb) {
>>> int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
>>> (req->exe_req << 1) | (req->pm_req);
>>> @@ -920,7 +982,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
>>> and these can be NULL. Do not use them below this point!
>>> */
>>> sdev = NULL;
>>> svm = NULL;
>>> - no_pasid:
>>> +no_pasid:
>>> if (req->lpig || req->priv_data_present) {
>>> /*
>>> * Per VT-d spec. v3.0 ch7.7, system software must
>>> @@ -945,6 +1007,7 @@ static irqreturn_t prq_event_thread(int irq, void
>> *d)
>>> resp.qw3 = 0;
>>> qi_submit_sync(iommu, &resp, 1, 0);
>>> }
>>> +prq_advance:
>>> head = (head + sizeof(*req)) & PRQ_RING_MASK;
>>> }
>>>
>>> --
>>> 2.17.1
>>
>> Thanks
>> Kevin

2020-07-07 11:26:33

by Jean-Philippe Brucker

[permalink] [raw]
Subject: Re: [PATCH v2 3/4] iommu/vt-d: Report page request faults for guest SVA

On Mon, Jul 06, 2020 at 08:25:34AM +0800, Lu Baolu wrote:
> A pasid might be bound to a page table from a VM guest via the iommu
> ops.sva_bind_gpasid. In this case, when a DMA page fault is detected
> on the physical IOMMU, we need to inject the page fault request into
> the guest. After the guest completes handling the page fault, a page
> response need to be sent back via the iommu ops.page_response().
>
> This adds support to report a page request fault. Any external module
> which is interested in handling this fault should regiester a notifier
> callback.
>
> Co-developed-by: Jacob Pan <[email protected]>
> Signed-off-by: Jacob Pan <[email protected]>
> Co-developed-by: Liu Yi L <[email protected]>
> Signed-off-by: Liu Yi L <[email protected]>
> Signed-off-by: Lu Baolu <[email protected]>
> ---
[...]
> +static int
> +intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
> +{
> + struct iommu_fault_event event;
> + u8 bus, devfn;
> +
> + memset(&event, 0, sizeof(struct iommu_fault_event));
> + bus = PCI_BUS_NUM(desc->rid);
> + devfn = desc->rid & 0xff;
> +
> + /* Fill in event data for device specific processing */
> + event.fault.type = IOMMU_FAULT_PAGE_REQ;
> + event.fault.prm.addr = desc->addr;
> + event.fault.prm.pasid = desc->pasid;
> + event.fault.prm.grpid = desc->prg_index;
> + event.fault.prm.perm = prq_to_iommu_prot(desc);
> +
> + /*
> + * Set last page in group bit if private data is present,
> + * page response is required as it does for LPIG.
> + */
> + if (desc->lpig)
> + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
> + if (desc->pasid_present)
> + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;

Do you also need to set IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID? I added
the flag to deal with devices that do not want a PASID value in their PRI
response (bit 15 in the PCIe Page Request Status Register):
https://lore.kernel.org/linux-iommu/[email protected]/
(applied by Joerg for v5.9)

Grepping for pci_prg_resp_pasid_required() in intel/iommu.c it seems to
currently reject devices that do not want a PASID in a PRI response, so I
think you can set this flag unconditionally for now.

Thanks,
Jean

> + if (desc->priv_data_present) {
> + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
> + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
> + memcpy(event.fault.prm.private_data, desc->priv_data,
> + sizeof(desc->priv_data));
> + }
> +
> + return iommu_report_device_fault(dev, &event);
> +}

2020-07-08 02:20:13

by Lu Baolu

[permalink] [raw]
Subject: Re: [PATCH v2 3/4] iommu/vt-d: Report page request faults for guest SVA

Hi Jean,

On 7/7/20 7:23 PM, Jean-Philippe Brucker wrote:
> On Mon, Jul 06, 2020 at 08:25:34AM +0800, Lu Baolu wrote:
>> A pasid might be bound to a page table from a VM guest via the iommu
>> ops.sva_bind_gpasid. In this case, when a DMA page fault is detected
>> on the physical IOMMU, we need to inject the page fault request into
>> the guest. After the guest completes handling the page fault, a page
>> response need to be sent back via the iommu ops.page_response().
>>
>> This adds support to report a page request fault. Any external module
>> which is interested in handling this fault should regiester a notifier
>> callback.
>>
>> Co-developed-by: Jacob Pan <[email protected]>
>> Signed-off-by: Jacob Pan <[email protected]>
>> Co-developed-by: Liu Yi L <[email protected]>
>> Signed-off-by: Liu Yi L <[email protected]>
>> Signed-off-by: Lu Baolu <[email protected]>
>> ---
> [...]
>> +static int
>> +intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
>> +{
>> + struct iommu_fault_event event;
>> + u8 bus, devfn;
>> +
>> + memset(&event, 0, sizeof(struct iommu_fault_event));
>> + bus = PCI_BUS_NUM(desc->rid);
>> + devfn = desc->rid & 0xff;
>> +
>> + /* Fill in event data for device specific processing */
>> + event.fault.type = IOMMU_FAULT_PAGE_REQ;
>> + event.fault.prm.addr = desc->addr;
>> + event.fault.prm.pasid = desc->pasid;
>> + event.fault.prm.grpid = desc->prg_index;
>> + event.fault.prm.perm = prq_to_iommu_prot(desc);
>> +
>> + /*
>> + * Set last page in group bit if private data is present,
>> + * page response is required as it does for LPIG.
>> + */
>> + if (desc->lpig)
>> + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
>> + if (desc->pasid_present)
>> + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
>
> Do you also need to set IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID? I added
> the flag to deal with devices that do not want a PASID value in their PRI
> response (bit 15 in the PCIe Page Request Status Register):
> https://lore.kernel.org/linux-iommu/[email protected]/
> (applied by Joerg for v5.9)
>
> Grepping for pci_prg_resp_pasid_required() in intel/iommu.c it seems to
> currently reject devices that do not want a PASID in a PRI response, so I
> think you can set this flag unconditionally for now.

Yes. You are right. I will set this flag in the next version.

Best regards,
baolu

>
> Thanks,
> Jean
>
>> + if (desc->priv_data_present) {
>> + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
>> + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
>> + memcpy(event.fault.prm.private_data, desc->priv_data,
>> + sizeof(desc->priv_data));
>> + }
>> +
>> + return iommu_report_device_fault(dev, &event);
>> +}