Guest shared virtual address (SVA) may require host to shadow guest
PASID tables. Guest PASID can also be allocated from the host via
enlightened interfaces. In this case, the guest needs to bind the guest
mm, i.e. the cr3 in guest physical address, to the actual PASID table in
the host IOMMU. Nesting will be turned on such that a guest virtual
address can go through a two-level translation:
- 1st level translates GVA to GPA
- 2nd level translates GPA to HPA
This patch introduces APIs to bind guest PASID data to the assigned
device entry in the physical IOMMU. See the diagram below for a usage
explanation.
.-------------. .---------------------------.
| vIOMMU | | Guest process mm, FL only |
| | '---------------------------'
.----------------/
| PASID Entry |--- PASID cache flush -
'-------------' |
| | V
| |
'-------------'
Guest
------| Shadow |--------------------------|------------
v v v
Host
.-------------. .----------------------.
| pIOMMU | | Bind FL for GVA-GPA |
| | '----------------------'
.----------------/ |
| PASID Entry | V (Nested xlate)
'----------------\.---------------------.
| | |Set SL to GPA-HPA |
| | '---------------------'
'-------------'
Where:
- FL = First level/stage one page tables
- SL = Second level/stage two page tables
Signed-off-by: Jacob Pan <[email protected]>
Signed-off-by: Liu Yi L <[email protected]>
---
drivers/iommu/iommu.c | 20 ++++++++++++++++++++
include/linux/iommu.h | 10 ++++++++++
include/uapi/linux/iommu.h | 15 ++++++++++++++-
3 files changed, 44 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index a2f6f3e..f8572d2 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1659,6 +1659,26 @@ int iommu_cache_invalidate(struct iommu_domain *domain, struct device *dev,
}
EXPORT_SYMBOL_GPL(iommu_cache_invalidate);
+int iommu_sva_bind_gpasid(struct iommu_domain *domain,
+ struct device *dev, struct gpasid_bind_data *data)
+{
+ if (unlikely(!domain->ops->sva_bind_gpasid))
+ return -ENODEV;
+
+ return domain->ops->sva_bind_gpasid(domain, dev, data);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_bind_gpasid);
+
+int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev,
+ int pasid)
+{
+ if (unlikely(!domain->ops->sva_unbind_gpasid))
+ return -ENODEV;
+
+ return domain->ops->sva_unbind_gpasid(dev, pasid);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid);
+
static void __iommu_detach_device(struct iommu_domain *domain,
struct device *dev)
{
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d182525..9a69b59 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -268,6 +268,8 @@ struct page_response_msg {
* @detach_pasid_table: detach the pasid table
* @cache_invalidate: invalidate translation caches
* @pgsize_bitmap: bitmap of all possible supported page sizes
+ * @sva_bind_gpasid: bind guest pasid and mm
+ * @sva_unbind_gpasid: unbind guest pasid and mm
*/
struct iommu_ops {
bool (*capable)(enum iommu_cap);
@@ -332,6 +334,10 @@ struct iommu_ops {
int (*page_response)(struct device *dev, struct page_response_msg *msg);
int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev,
struct iommu_cache_invalidate_info *inv_info);
+ int (*sva_bind_gpasid)(struct iommu_domain *domain,
+ struct device *dev, struct gpasid_bind_data *data);
+
+ int (*sva_unbind_gpasid)(struct device *dev, int pasid);
unsigned long pgsize_bitmap;
};
@@ -447,6 +453,10 @@ extern void iommu_detach_pasid_table(struct iommu_domain *domain);
extern int iommu_cache_invalidate(struct iommu_domain *domain,
struct device *dev,
struct iommu_cache_invalidate_info *inv_info);
+extern int iommu_sva_bind_gpasid(struct iommu_domain *domain,
+ struct device *dev, struct gpasid_bind_data *data);
+extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain,
+ struct device *dev, int pasid);
extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
index fa96ecb..3a781df 100644
--- a/include/uapi/linux/iommu.h
+++ b/include/uapi/linux/iommu.h
@@ -240,6 +240,19 @@ struct iommu_cache_invalidate_info {
struct iommu_inv_addr_info addr_info;
};
};
-
+/**
+ * struct gpasid_bind_data - Information about device and guest PASID binding
+ * @gcr3: Guest CR3 value from guest mm
+ * @pasid: Process address space ID used for the guest mm
+ * @addr_width: Guest address width. Paging mode can also be derived.
+ */
+struct gpasid_bind_data {
+ __u64 gcr3;
+ __u32 pasid;
+ __u32 addr_width;
+ __u32 flags;
+#define IOMMU_SVA_GPASID_SRE BIT(0) /* supervisor request */
+ __u8 padding[4];
+};
#endif /* _UAPI_IOMMU_H */
--
2.7.4
Hi Jacob,
On 03/05/2019 23:32, Jacob Pan wrote:
> +/**
> + * struct gpasid_bind_data - Information about device and guest PASID binding
> + * @gcr3: Guest CR3 value from guest mm
> + * @pasid: Process address space ID used for the guest mm
> + * @addr_width: Guest address width. Paging mode can also be derived.
> + */
> +struct gpasid_bind_data {
> + __u64 gcr3;
> + __u32 pasid;
> + __u32 addr_width;
> + __u32 flags;
> +#define IOMMU_SVA_GPASID_SRE BIT(0) /* supervisor request */
> + __u8 padding[4];
> +};
Could you wrap this structure into a generic one like we now do for
bind_pasid_table? It would make the API easier to extend, because if we
ever add individual PASID bind on Arm (something I'd like to do for
virtio-iommu, eventually) it will have different parameters, as our
PASID table entry has a lot of fields describing the page table format.
Maybe something like the following would do?
struct gpasid_bind_data {
#define IOMMU_GPASID_BIND_VERSION_1 1
__u32 version;
#define IOMMU_GPASID_BIND_FORMAT_INTEL_VTD 1
__u32 format;
union {
// the current gpasid_bind_data:
struct gpasid_bind_intel_vtd vtd;
};
};
Thanks,
Jean
On Thu, 16 May 2019 15:14:40 +0100
Jean-Philippe Brucker <[email protected]> wrote:
> Hi Jacob,
>
> On 03/05/2019 23:32, Jacob Pan wrote:
> > +/**
> > + * struct gpasid_bind_data - Information about device and guest
> > PASID binding
> > + * @gcr3: Guest CR3 value from guest mm
> > + * @pasid: Process address space ID used for the guest mm
> > + * @addr_width: Guest address width. Paging mode can also
> > be derived.
> > + */
> > +struct gpasid_bind_data {
> > + __u64 gcr3;
> > + __u32 pasid;
> > + __u32 addr_width;
> > + __u32 flags;
> > +#define IOMMU_SVA_GPASID_SRE BIT(0) /* supervisor
> > request */
> > + __u8 padding[4];
> > +};
>
> Could you wrap this structure into a generic one like we now do for
> bind_pasid_table? It would make the API easier to extend, because if
> we ever add individual PASID bind on Arm (something I'd like to do for
> virtio-iommu, eventually) it will have different parameters, as our
> PASID table entry has a lot of fields describing the page table
> format.
>
> Maybe something like the following would do?
>
> struct gpasid_bind_data {
> #define IOMMU_GPASID_BIND_VERSION_1 1
> __u32 version;
> #define IOMMU_GPASID_BIND_FORMAT_INTEL_VTD 1
> __u32 format;
> union {
> // the current gpasid_bind_data:
> struct gpasid_bind_intel_vtd vtd;
> };
> };
>
OK, sounds great.
On Thu, 16 May 2019 09:14:29 -0700
Jacob Pan <[email protected]> wrote:
> On Thu, 16 May 2019 15:14:40 +0100
> Jean-Philippe Brucker <[email protected]> wrote:
>
> > Hi Jacob,
> >
> > On 03/05/2019 23:32, Jacob Pan wrote:
> > > +/**
> > > + * struct gpasid_bind_data - Information about device and guest
> > > PASID binding
> > > + * @gcr3: Guest CR3 value from guest mm
> > > + * @pasid: Process address space ID used for the guest mm
> > > + * @addr_width: Guest address width. Paging mode can also
> > > be derived.
> > > + */
> > > +struct gpasid_bind_data {
> > > + __u64 gcr3;
> > > + __u32 pasid;
> > > + __u32 addr_width;
> > > + __u32 flags;
> > > +#define IOMMU_SVA_GPASID_SRE BIT(0) /* supervisor
> > > request */
> > > + __u8 padding[4];
> > > +};
> >
> > Could you wrap this structure into a generic one like we now do for
> > bind_pasid_table? It would make the API easier to extend, because if
> > we ever add individual PASID bind on Arm (something I'd like to do
> > for virtio-iommu, eventually) it will have different parameters, as
> > our PASID table entry has a lot of fields describing the page table
> > format.
> >
> > Maybe something like the following would do?
> >
> > struct gpasid_bind_data {
> > #define IOMMU_GPASID_BIND_VERSION_1 1
> > __u32 version;
> > #define IOMMU_GPASID_BIND_FORMAT_INTEL_VTD 1
> > __u32 format;
> > union {
> > // the current gpasid_bind_data:
> > struct gpasid_bind_intel_vtd vtd;
> > };
> > };
> >
Could you review the struct below? I am trying to extract the
common fields as much as possible. Didn't do exactly as you suggested
to keep vendor specific data in separate struct under the same union.
Also, can you review the v3 ioasid allocator common code patches? I am
hoping we can get the common code in v5.3 so that we can focus on the
vendor specific part. The common code should include bind_guest_pasid
and ioasid allocator.
https://lkml.org/lkml/2019/5/3/787
https://lkml.org/lkml/2019/5/3/780
Thanks,
Jacob
/**
* struct gpasid_bind_data_vtd - Intel VT-d specific data on device and guest
* SVA binding.
*
* @flags: VT-d PASID table entry attributes; presumably a bitwise OR of the
*         IOMMU_SVA_VTD_GPASID_* values defined inside this struct
*         (TODO confirm)
* @pat: Page attribute table data to compute effective memory type
* @emt: Extended memory type
*
* Only guest vIOMMU selectable and effective options are passed down to
* the host IOMMU.
*/
struct gpasid_bind_data_vtd {
/* Attribute bits carried in @flags, one per PASID table entry attribute. */
#define IOMMU_SVA_VTD_GPASID_SRE BIT(0) /* supervisor request */
#define IOMMU_SVA_VTD_GPASID_EAFE BIT(1) /* extended access enable */
#define IOMMU_SVA_VTD_GPASID_PCD BIT(2) /* page-level cache disable */
#define IOMMU_SVA_VTD_GPASID_PWT BIT(3) /* page-level write through */
#define IOMMU_SVA_VTD_GPASID_EMTE BIT(4) /* extended memory type enable */
#define IOMMU_SVA_VTD_GPASID_CD BIT(5) /* PASID-level cache disable */
__u64 flags;
__u32 pat;
__u32 emt;
};
/**
* struct gpasid_bind_data - Information about device and guest PASID binding
* @version: Version of this data structure
* @format: PASID table entry format
* @flags: Additional information on guest bind request
* @gpgd: Guest page directory base of the guest mm to bind
* @hpasid: Process address space ID used for the guest mm in host IOMMU
* @gpasid: Process address space ID used for the guest mm in guest IOMMU;
*          validity is indicated by IOMMU_SVA_GPASID_VAL in @flags
* @addr_width: Guest address width. Paging mode can also be derived.
* @vtd: Intel VT-d specific data
*
* Common fields come first; vendor specific data is kept in a separate
* struct under the trailing union.
*/
struct gpasid_bind_data {
/* Known value for @version. */
#define IOMMU_GPASID_BIND_VERSION_1 1
__u32 version;
/* Known value for @format. */
#define IOMMU_PASID_FORMAT_INTEL_VTD 1
__u32 format;
/* Bit for @flags. */
#define IOMMU_SVA_GPASID_VAL BIT(1) /* guest PASID valid */
__u64 flags;
__u64 gpgd;
__u64 hpasid;
__u64 gpasid;
/*
 * NOTE(review): this lone __u32 sits directly before a union that holds a
 * __u64 member, so the compiler may insert implicit padding here; confirm
 * the intended layout.
 */
__u32 addr_width;
/* Vendor specific data */
union {
struct gpasid_bind_data_vtd vtd;
};
};
On 20/05/2019 20:22, Jacob Pan wrote:
> On Thu, 16 May 2019 09:14:29 -0700
> Jacob Pan <[email protected]> wrote:
>
>> On Thu, 16 May 2019 15:14:40 +0100
>> Jean-Philippe Brucker <[email protected]> wrote:
>>
>>> Hi Jacob,
>>>
>>> On 03/05/2019 23:32, Jacob Pan wrote:
>>>> +/**
>>>> + * struct gpasid_bind_data - Information about device and guest
>>>> PASID binding
>>>> + * @gcr3: Guest CR3 value from guest mm
>>>> + * @pasid: Process address space ID used for the guest mm
>>>> + * @addr_width: Guest address width. Paging mode can also
>>>> be derived.
>>>> + */
>>>> +struct gpasid_bind_data {
>>>> + __u64 gcr3;
>>>> + __u32 pasid;
>>>> + __u32 addr_width;
>>>> + __u32 flags;
>>>> +#define IOMMU_SVA_GPASID_SRE BIT(0) /* supervisor
>>>> request */
>>>> + __u8 padding[4];
>>>> +};
>>>
>>> Could you wrap this structure into a generic one like we now do for
>>> bind_pasid_table? It would make the API easier to extend, because if
>>> we ever add individual PASID bind on Arm (something I'd like to do
>>> for virtio-iommu, eventually) it will have different parameters, as
>>> our PASID table entry has a lot of fields describing the page table
>>> format.
>>>
>>> Maybe something like the following would do?
>>>
>>> struct gpasid_bind_data {
>>> #define IOMMU_GPASID_BIND_VERSION_1 1
>>> __u32 version;
>>> #define IOMMU_GPASID_BIND_FORMAT_INTEL_VTD 1
>>> __u32 format;
>>> union {
>>> // the current gpasid_bind_data:
>>> struct gpasid_bind_intel_vtd vtd;
>>> };
>>> };
>>>
>
> Could you review the struct below? I am trying to extract the
> common fileds as much as possible. Didn't do exactly as you suggested
> to keep vendor specific data in separate struct under the same union.
Thanks, it looks good and I think we can reuse it for SMMUv2 and v3.
Some comments below.
>
> Also, can you review the v3 ioasid allocator common code patches? I am
> hoping we can get the common code in v5.3 so that we can focus on the
> vendor specific part. The common code should include bind_guest_pasid
> and ioasid allocator.
> https://lkml.org/lkml/2019/5/3/787
> https://lkml.org/lkml/2019/5/3/780
>
> Thanks,
>
> Jacob
>
>
> /**
> * struct gpasid_bind_data_vtd - Intel VT-d specific data on device and guest
> * SVA binding.
> *
> * @flags: VT-d PASID table entry attributes
> * @pat: Page attribute table data to compute effective memory type
> * @emt: Extended memory type
> *
> * Only guest vIOMMU selectable and effective options are passed down to
> * the host IOMMU.
> */
> struct gpasid_bind_data_vtd {
> #define IOMMU_SVA_VTD_GPASID_SRE BIT(0) /* supervisor request */
> #define IOMMU_SVA_VTD_GPASID_EAFE BIT(1) /* extended access enable */
> #define IOMMU_SVA_VTD_GPASID_PCD BIT(2) /* page-level cache disable */
> #define IOMMU_SVA_VTD_GPASID_PWT BIT(3) /* page-level write through */
> #define IOMMU_SVA_VTD_GPASID_EMTE BIT(4) /* extended memory type enable */
> #define IOMMU_SVA_VTD_GPASID_CD BIT(5) /* PASID-level cache disable */
It doesn't seem like the BIT() macro is exported to userspace, so we
can't use it here
> __u64 flags;
> __u32 pat;
> __u32 emt;
> };
>
> /**
> * struct gpasid_bind_data - Information about device and guest PASID binding
> * @version: Version of this data structure
> * @format: PASID table entry format
> * @flags: Additional information on guest bind request
> * @gpgd: Guest page directory base of the guest mm to bind
> * @hpasid: Process address space ID used for the guest mm in host IOMMU
> * @gpasid: Process address space ID used for the guest mm in guest IOMMU
Trying to understand the full flow:
* @gpasid is the one allocated by the guest using a virtual command. The
guest writes @gpgd into the virtual PASID table at index @gpasid, then
sends an invalidate command to QEMU.
* QEMU issues a gpasid_bind ioctl (on the mdev or its container?). VFIO
forwards. The IOMMU driver installs @gpgd into the PASID table using
@hpasid, which is associated with the auxiliary domain.
But why do we need the @hpasid field here? Does userspace know about it
at all, and does VFIO need to pass it to the IOMMU driver?
> * @addr_width: Guest address width. Paging mode can also be derived.
What does the last sentence mean? @addr_width should probably be in @vtd
if it provides implicit information.
> * @vtd: Intel VT-d specific data
> */
> struct gpasid_bind_data {
> #define IOMMU_GPASID_BIND_VERSION_1 1
> __u32 version;
> #define IOMMU_PASID_FORMAT_INTEL_VTD 1
> __u32 format;
> #define IOMMU_SVA_GPASID_VAL BIT(1) /* guest PASID valid */
(There are tabs between define and name here, as well as in the VT-d
specific data)
> __u64 flags;
> __u64 gpgd;
> __u64 hpasid;
> __u64 gpasid;
> __u32 addr_width;
I think the union has to be aligned on 64-bit, otherwise a compiler
might insert padding (https://lkml.org/lkml/2019/1/11/1207)
Thanks,
Jean
> /* Vendor specific data */
> union {
> struct gpasid_bind_data_vtd vtd;
> };
> };
>
>
On Tue, 21 May 2019 17:09:40 +0100
Jean-Philippe Brucker <[email protected]> wrote:
> On 20/05/2019 20:22, Jacob Pan wrote:
> > On Thu, 16 May 2019 09:14:29 -0700
> > Jacob Pan <[email protected]> wrote:
> >
> >> On Thu, 16 May 2019 15:14:40 +0100
> >> Jean-Philippe Brucker <[email protected]> wrote:
> >>
> >>> Hi Jacob,
> >>>
> >>> On 03/05/2019 23:32, Jacob Pan wrote:
> >>>> +/**
> >>>> + * struct gpasid_bind_data - Information about device and guest
> >>>> PASID binding
> >>>> + * @gcr3: Guest CR3 value from guest mm
> >>>> + * @pasid: Process address space ID used for the guest mm
> >>>> + * @addr_width: Guest address width. Paging mode can also
> >>>> be derived.
> >>>> + */
> >>>> +struct gpasid_bind_data {
> >>>> + __u64 gcr3;
> >>>> + __u32 pasid;
> >>>> + __u32 addr_width;
> >>>> + __u32 flags;
> >>>> +#define IOMMU_SVA_GPASID_SRE BIT(0) /* supervisor
> >>>> request */
> >>>> + __u8 padding[4];
> >>>> +};
> >>>
> >>> Could you wrap this structure into a generic one like we now do
> >>> for bind_pasid_table? It would make the API easier to extend,
> >>> because if we ever add individual PASID bind on Arm (something
> >>> I'd like to do for virtio-iommu, eventually) it will have
> >>> different parameters, as our PASID table entry has a lot of
> >>> fields describing the page table format.
> >>>
> >>> Maybe something like the following would do?
> >>>
> >>> struct gpasid_bind_data {
> >>> #define IOMMU_GPASID_BIND_VERSION_1 1
> >>> __u32 version;
> >>> #define IOMMU_GPASID_BIND_FORMAT_INTEL_VTD 1
> >>> __u32 format;
> >>> union {
> >>> // the current gpasid_bind_data:
> >>> struct gpasid_bind_intel_vtd vtd;
> >>> };
> >>> };
> >>>
> >
> > Could you review the struct below? I am trying to extract the
> > common fileds as much as possible. Didn't do exactly as you
> > suggested to keep vendor specific data in separate struct under the
> > same union.
>
> Thanks, it looks good and I think we can reuse it for SMMUv2 and v3.
> Some comments below.
>
> >
> > Also, can you review the v3 ioasid allocator common code patches? I
> > am hoping we can get the common code in v5.3 so that we can focus
> > on the vendor specific part. The common code should include
> > bind_guest_pasid and ioasid allocator.
> > https://lkml.org/lkml/2019/5/3/787
> > https://lkml.org/lkml/2019/5/3/780
> >
> > Thanks,
> >
> > Jacob
> >
> >
> > /**
> > * struct gpasid_bind_data_vtd - Intel VT-d specific data on device
> > and guest
> > * SVA binding.
> > *
> > * @flags: VT-d PASID table entry attributes
> > * @pat: Page attribute table data to compute effective
> > memory type
> > * @emt: Extended memory type
> > *
> > * Only guest vIOMMU selectable and effective options are passed
> > down to
> > * the host IOMMU.
> > */
> > struct gpasid_bind_data_vtd {
> > #define IOMMU_SVA_VTD_GPASID_SRE BIT(0) /* supervisor
> > request */ #define IOMMU_SVA_VTD_GPASID_EAFE
> > BIT(1) /* extended access enable */ #define
> > IOMMU_SVA_VTD_GPASID_PCD BIT(2) /* page-level cache disable
> > */ #define IOMMU_SVA_VTD_GPASID_PWT BIT(3) /*
> > page-level write through */ #define
> > IOMMU_SVA_VTD_GPASID_EMTE BIT(4) /* extended memory type
> > enable */ #define IOMMU_SVA_VTD_GPASID_CD
> > BIT(5) /* PASID-level cache disable */
>
> It doesn't seem like the BIT() macro is exported to userspace, so we
> can't use it here
>
good point, will avoid BIT()
> > __u64 flags;
> > __u32 pat;
> > __u32 emt;
> > };
> >
> > /**
> > * struct gpasid_bind_data - Information about device and guest
> > PASID binding
> > * @version: Version of this data structure
> > * @format: PASID table entry format
> > * @flags: Additional information on guest bind request
> > * @gpgd: Guest page directory base of the guest mm to bind
> > * @hpasid: Process address space ID used for the guest mm
> > in host IOMMU
> > * @gpasid: Process address space ID used for the guest mm
> > in guest IOMMU
>
> Trying to understand the full flow:
> * @gpasid is the one allocated by the guest using a virtual command.
> The guest writes @gpgd into the virtual PASID table at index @gpasid,
> then sends an invalidate command to QEMU.
yes
> * QEMU issues a gpasid_bind ioctl (on the mdev or its container?).
> VFIO forwards. The IOMMU driver installs @gpgd into the PASID table
> using @hpasid, which is associated with the auxiliary domain.
>
> But why do we need the @hpasid field here? Does userspace know about
> it at all, and does VFIO need to pass it to the IOMMU driver?
>
We need to support two guest-host PASID mappings through this API. Idea
comes from Kevin & Yi.
1. identity mapping between host and guest PASID
2. guest owns its own pasid space
Option 1 is the one we plan to support first in this series. There is
no need for the gpasid field since gpasid=hpasid. The guest allocates a
PASID using the virtual command interface, which returns a host PASID.
PASID cache invalidation in the guest will then result in bind_gpasid();
@gpasid is not valid in the bind data (indicated by the
IOMMU_SVA_GPASID_VAL flag).
For option 2, the guest still uses the virtual command to allocate a guest
PASID, but this time QEMU does the allocation for gpasid; at the same time
QEMU will allocate a host PASID and then maintain a G->H PASID lookup.
When the guest invalidates its PASID cache with GPASID, QEMU will find the
matching host PASID and then pass both gpasid and hpasid down to the host
IOMMU driver.
The host IOMMU driver will store the gpgd at the hpasid entry but keep
track of the gpasid->hpasid mapping. The host will never program gpasid in
the IOMMU HW. The host IOMMU driver provides G->H PASID translation for PF
device drivers that emulate mdev config space, i.e. the virtual device
composition module
(https://events.linuxfoundation.org/wp-content/uploads/2017/12/Hardware-Assisted-Mediated-Pass-Through-with-VFIO-Kevin-Tian-Intel.pdf).
The choice between these two options is made per VM. Hopefully the two
diagrams below can help to explain. I will put them in the next patch headers.
Option 1. Identity G-H PASID mapping diagram.
.-------------. .---------------------------.
| vIOMMU | | Guest process mm, FL only |
| | '---------------------------'
.----------------/
| PASID Entry |--- PASID cache flush -
'-------------'\ |
| | \ |
| | \ |
'-------------' \________________ |
GPASID = HPASID |
Guest ^ ^ |
------| Shadow |-------| VCMD |-----------|------------
v v | | |
QEMU v v |
------------------------------------------|------------
Host HPASID = ioasid_alloc() |
| v
| sva_bind_gpasid(HPASID)
|
.-------------. | .----------------------.
| pIOMMU | | | Bind FL for GVA-GPA |
| | | /'----------------------'
.----------------' |
| PASID Entry | V (Nested xlate)
'----------------..---------------------.
| | |Set SL to GPA-HPA |
| | '---------------------'
'-------------'
Option 2. Non-identity G-H PASID mapping diagram.
.-------------. .---------------------------.
| vIOMMU | | Guest process mm, FL only |
| | '---------------------------'
.----------------/
| PASID Entry |--- PASID cache flush -
'-------------'\ | .-------------.
| | \ | |Guest driver |
| | \ | |writes GPASID|
'-------------' \________________ | '-------------'
GPASID | |
Guest ^ ^ | |
------| Shadow |-------| VCMD |-----------|------------ |
v v | | | |
QEMU v v | |
GPASID = qemu_gpasid_alloc() | |
keep G->H PASID lookup | |
^ v |
| lookup G->H PASID |
-------------------|----------------------|------------ |
Host HPASID = ioasid_alloc() | |
| v |
| sva_bind_gpasid(HPASID,GPASID)|
| keep H-G PASID lookup |
| \ -------------------.
.-------------. | .----------------------. \| VDCM |
| pIOMMU | | | Bind FL for GVA-GPA | | H = lookup(GPASID)|
| | | /'----------------------' | write H to dev |
.----------------' | '------------------'
| PASID Entry | V (Nested xlate)
'----------------..---------------------.
| | |Set SL to GPA-HPA |
| | '---------------------'
'-------------'
There are also implications of the G-H PASID lookup for PRQ; those will be
addressed in a later series.
> > * @addr_width: Guest address width. Paging mode can also be
> > derived.
>
> What does the last sentence mean? @addr_width should probably be in
> @vtd if it provides implicit information.
>
The 4- or 5-level paging mode is derived from the address width. It could
be in @vtd, but I thought this could be generic.
> > * @vtd: Intel VT-d specific data
> > */
> > struct gpasid_bind_data {
> > #define IOMMU_GPASID_BIND_VERSION_1 1
> > __u32 version;
> > #define IOMMU_PASID_FORMAT_INTEL_VTD 1
> > __u32 format;
> > #define IOMMU_SVA_GPASID_VAL BIT(1) /* guest PASID
> > valid */
>
> (There are tabs between define and name here, as well as in the VT-d
> specific data)
>
> > __u64 flags;
> > __u64 gpgd;
> > __u64 hpasid;
> > __u64 gpasid;
> > __u32 addr_width;
>
> I think the union has to be aligned on 64-bit, otherwise a compiler
> might insert padding (https://lkml.org/lkml/2019/1/11/1207)
>
good point, will fix.
> Thanks,
> Jean
>
> > /* Vendor specific data */
> > union {
> > struct gpasid_bind_data_vtd vtd;
> > };
> > };
> >
> >
>
[Jacob Pan]
On 21/05/2019 23:50, Jacob Pan wrote:
>>> /**
>>> * struct gpasid_bind_data - Information about device and guest
>>> PASID binding
>>> * @version: Version of this data structure
>>> * @format: PASID table entry format
>>> * @flags: Additional information on guest bind request
>>> * @gpgd: Guest page directory base of the guest mm to bind
>>> * @hpasid: Process address space ID used for the guest mm
>>> in host IOMMU
>>> * @gpasid: Process address space ID used for the guest mm
>>> in guest IOMMU
>>
>> Trying to understand the full flow:
>> * @gpasid is the one allocated by the guest using a virtual command.
>> The guest writes @gpgd into the virtual PASID table at index @gpasid,
>> then sends an invalidate command to QEMU.
> yes
>> * QEMU issues a gpasid_bind ioctl (on the mdev or its container?).
>> VFIO forwards. The IOMMU driver installs @gpgd into the PASID table
>> using @hpasid, which is associated with the auxiliary domain.
>>
>> But why do we need the @hpasid field here? Does userspace know about
>> it at all, and does VFIO need to pass it to the IOMMU driver?
>>
> We need to support two guest-host PASID mappings through this API. Idea
> comes from Kevin & Yi.
> 1. identity mapping between host and guest PASID
> 2. guest owns its own pasid space
>
> For option 1, which will plan to support first in this series. There is
> no need for gpasid field since gpasid=hpasid. Guest allocates PASID
> using virtual command interface which gets a host PASID. Then PASID
> cache invalidation in the guest will result in bind_gpasid(), @gpasid is
> not valid in the bind data (indicated by the IOMMU_SVA_GPASID_VAL flag).
>
> For option 2, guest still uses virtual command to allocate guest pasid,
> but this time QEMU does the allocation for gpasid, at the same time
> QEMU will allocate a host pasid then maintain a G->H PASID lookup.
> When guest invalidate its PASID cache with GPASID, QEMU will find the
> match host PASID then pass both gpasid and hpasid down to the host IOMMU
> driver.
> Host IOMMU driver will store the gpgd at the hpasid entry but keep
> track of the gpasid->hpasid mapping. Host will never program gpasid in
> the IOMMU HW. Host IOMMU driver provides G->H PASID translation for PF
> device drivers that emulates mdev config space, i.e. virtual device
> composition module
> (https://events.linuxfoundation.org/wp-content/uploads/2017/12/Hardware-Assisted-Mediated-Pass-Through-with-VFIO-Kevin-Tian-Intel.pdf).
>
> These two options is a per VM choice. Hopefully the two diagrams below
> can help to explain. I will put them in the next patch headers.
Thanks for the explanation, makes sense to me now. So the host kernel
needs to know G->H because the guest may write GPASID into the config
space emulated by the host device driver, and device driver then
retrieves the HPASID via an iommu_ops callback? But the device driver
keeps track of aux domains so isn't HPASID retrievable with
aux_get_pasid() already?
>
> Option 1. Identity G-H PASID mapping diagram.
>
> .-------------. .---------------------------.
> | vIOMMU | | Guest process mm, FL only |
> | | '---------------------------'
> .----------------/
> | PASID Entry |--- PASID cache flush -
> '-------------'\ |
> | | \ |
> | | \ |
> '-------------' \________________ |
> GPASID = HPASID |
> Guest ^ ^ |
> ------| Shadow |-------| VCMD |-----------|------------
> v v | | |
> QEMU v v |
> ------------------------------------------|------------
> Host HPASID = ioasid_alloc() |
> | v
> | sva_bind_gpasid(HPASID)
> |
> .-------------. | .----------------------.
> | pIOMMU | | | Bind FL for GVA-GPA |
> | | | /'----------------------'
> .----------------' |
> | PASID Entry | V (Nested xlate)
> '----------------..---------------------.
> | | |Set SL to GPA-HPA |
> | | '---------------------'
> '-------------'
>
>
>
> Option 2. Non-identity G-H PASID mapping diagram.
>
> .-------------. .---------------------------.
> | vIOMMU | | Guest process mm, FL only |
> | | '---------------------------'
> .----------------/
> | PASID Entry |--- PASID cache flush -
> '-------------'\ | .-------------.
> | | \ | |Guest driver |
> | | \ | |writes GPASID|
> '-------------' \________________ | '-------------'
> GPASID | |
> Guest ^ ^ | |
> ------| Shadow |-------| VCMD |-----------|------------ |
> v v | | | |
> QEMU v v | |
> GPASID = qemu_gpasid_alloc() | |
> keep G->H PASID lookup | |
> ^ v |
> | lookup G->H PASID |
> -------------------|----------------------|------------ |
> Host HPASID = ioasid_alloc() | |
> | v |
> | sva_bind_gpasid(HPASID,GPASID)|
> | keep H-G PASID lookup |
> | \ -------------------.
> .-------------. | .----------------------. \| VDCM |
> | pIOMMU | | | Bind FL for GVA-GPA | | H = lookup(GPASID)|
> | | | /'----------------------' | write H to dev |
> .----------------' | '------------------'
> | PASID Entry | V (Nested xlate)
> '----------------..---------------------.
> | | |Set SL to GPA-HPA |
> | | '---------------------'
> '-------------'
> There is also implications in G-H pasid lookup for PRQ, that would be
> in the later series.
>
>>> * @addr_width: Guest address width. Paging mode can also be
>>> derived.
>>
>> What does the last sentence mean? @addr_width should probably be in
>> @vtd if it provides implicit information.
>>
> Derive 4 or 5 level paging mode from the address width. It can be in
> @vtd but i thought this can be generic.
Yes I think it's generic enough. It may be worth stating that this is
the *virtual* address width, and removing or clarifying what the paging
mode is (the sentence could be confusing on Arm, as we have different
page granules which cannot be derived from the address width)
Thanks,
Jean
On Wed, 22 May 2019 16:05:53 +0100
Jean-Philippe Brucker <[email protected]> wrote:
> On 21/05/2019 23:50, Jacob Pan wrote:
> >>> /**
> >>> * struct gpasid_bind_data - Information about device and guest
> >>> PASID binding
> >>> * @version: Version of this data structure
> >>> * @format: PASID table entry format
> >>> * @flags: Additional information on guest bind request
> >>> * @gpgd: Guest page directory base of the guest mm to bind
> >>> * @hpasid: Process address space ID used for the guest mm
> >>> in host IOMMU
> >>> * @gpasid: Process address space ID used for the guest mm
> >>> in guest IOMMU
> >>
> >> Trying to understand the full flow:
> >> * @gpasid is the one allocated by the guest using a virtual
> >> command. The guest writes @gpgd into the virtual PASID table at
> >> index @gpasid, then sends an invalidate command to QEMU.
> > yes
> >> * QEMU issues a gpasid_bind ioctl (on the mdev or its container?).
> >> VFIO forwards. The IOMMU driver installs @gpgd into the PASID table
> >> using @hpasid, which is associated with the auxiliary domain.
> >>
> >> But why do we need the @hpasid field here? Does userspace know
> >> about it at all, and does VFIO need to pass it to the IOMMU driver?
> >>
> > We need to support two guest-host PASID mappings through this API.
> > Idea comes from Kevin & Yi.
> > 1. identity mapping between host and guest PASID
> > 2. guest owns its own pasid space
> >
> > > For option 1, which we plan to support first in this series, there
> > > is no need for the gpasid field since gpasid=hpasid. The guest
> > > allocates a PASID using the virtual command interface, which
> > > returns a host PASID. PASID cache invalidation in the guest then
> > > results in bind_gpasid(); @gpasid is not valid in the bind data
> > > (as indicated by the IOMMU_SVA_GPASID_VAL flag).
> >
> > > For option 2, the guest still uses the virtual command to allocate
> > > a guest PASID, but this time QEMU does the allocation for gpasid.
> > > At the same time, QEMU allocates a host PASID and maintains a G->H
> > > PASID lookup. When the guest invalidates its PASID cache with
> > > GPASID, QEMU finds the matching host PASID and passes both gpasid
> > > and hpasid down to the host IOMMU driver.
> > > The host IOMMU driver stores the gpgd at the hpasid entry but keeps
> > > track of the gpasid->hpasid mapping. The host never programs gpasid
> > > into the IOMMU HW. The host IOMMU driver provides G->H PASID
> > > translation for PF device drivers that emulate the mdev config
> > > space, i.e. the virtual device composition module
> > (https://events.linuxfoundation.org/wp-content/uploads/2017/12/Hardware-Assisted-Mediated-Pass-Through-with-VFIO-Kevin-Tian-Intel.pdf).
> >
> > > The choice between these two options is made per VM. Hopefully the
> > > two diagrams below help to explain. I will put them in the commit
> > > messages of the next version of the patches.
>
> Thanks for the explanation, makes sense to me now. So the host kernel
> needs to know G->H because the guest may write GPASID into the config
> space emulated by the host device driver, and device driver then
> retrieves the HPASID via an iommu_ops callback? But the device driver
> keeps track of aux domains so isn't HPASID retrievable with
> aux_get_pasid() already?
>
aux_get_pasid() returns the domain's default PASID, which is used for
non-SVM traffic on the mdev. Here the gpasid bind is for SVM only.
> >
> > Option 1. Identity G-H PASID mapping diagram.
> >
> > .-------------. .---------------------------.
> > | vIOMMU | | Guest process mm, FL only |
> > | | '---------------------------'
> > .----------------/
> > | PASID Entry |--- PASID cache flush -
> > '-------------'\ |
> > | | \ |
> > | | \ |
> > '-------------' \________________ |
> > GPASID = HPASID |
> > Guest ^ ^ |
> > ------| Shadow |-------| VCMD |-----------|------------
> > v v | | |
> > QEMU v v |
> > ------------------------------------------|------------
> > Host HPASID = ioasid_alloc() |
> > | v
> > | sva_bind_gpasid(HPASID)
> > |
> > .-------------. | .----------------------.
> > | pIOMMU | | | Bind FL for GVA-GPA |
> > | | | /'----------------------'
> > .----------------' |
> > | PASID Entry | V (Nested xlate)
> > '----------------..---------------------.
> > | | |Set SL to GPA-HPA |
> > | | '---------------------'
> > '-------------'
> >
> >
> >
> > Option 2. Non-identity G-H PASID mapping diagram.
> >
> > .-------------. .---------------------------.
> > | vIOMMU | | Guest process mm, FL only |
> > | | '---------------------------'
> > .----------------/
> > | PASID Entry |--- PASID cache flush -
> > '-------------'\ | .-------------.
> > | | \ | |Guest driver |
> > | | \ | |writes GPASID|
> > '-------------' \________________ | '-------------'
> > GPASID | |
> > Guest ^ ^ | |
> > ------| Shadow |-------| VCMD |-----------|------------ |
> > v v | | | |
> > QEMU v v | |
> > GPASID = qemu_gpasid_alloc() | |
> > keep G->H PASID lookup | |
> > ^ v |
> > | lookup G->H PASID |
> > -------------------|----------------------|------------ |
> > Host HPASID = ioasid_alloc() | |
> > | v |
> > | sva_bind_gpasid(HPASID,GPASID)|
> > | keep H-G PASID lookup |
> > | \
> > -------------------. .-------------. | .----------------------.
> > \| VDCM | | pIOMMU | | | Bind FL for GVA-GPA |
> > | H = lookup(GPASID)| | | | /'----------------------'
> > | write H to dev | .----------------' |
> > '------------------' | PASID Entry | V (Nested xlate)
> > '----------------..---------------------.
> > | | |Set SL to GPA-HPA |
> > | | '---------------------'
> > '-------------'
> > > There are also implications for the G-H PASID lookup for PRQ; those
> > > will be addressed in a later series.
> >
> >>> * @addr_width: Guest address width. Paging mode can also
> >>> be derived.
> >>
> >> What does the last sentence mean? @addr_width should probably be in
> >> @vtd if it provides implicit information.
> >>
> > > The 4- or 5-level paging mode can be derived from the address
> > > width. It could go in @vtd, but I thought this could be generic.
>
> Yes I think it's generic enough. It may be worth stating that this is
> the *virtual* address width, and removing or clarifying what the
> paging mode is (the sentence could be confusing on Arm, as we have
> different page granules which cannot be derived from the address
> width)
>
OK, will keep addr_width as a generic field, then remove the paging
mode comment.
Thanks,
Jacob