LinuxLists.cc - [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID

2021-09-19 12:17:52

Subject: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

This patch adds IOASID allocation/free interface per iommufd. When
allocating an IOASID, userspace is expected to specify the type and
format information for the target I/O page table.

This RFC supports only one type (IOMMU_IOASID_TYPE_KERNEL_TYPE1V2),
implying a kernel-managed I/O page table with vfio type1v2 mapping
semantics. For this type the user should specify the addr_width of
the I/O address space and whether the I/O page table is created in
an iommu enfore_snoop format. enforce_snoop must be true at this point,
as the false setting requires additional contract with KVM on handling
WBINVD emulation, which can be added later.

Userspace is expected to call IOMMU_CHECK_EXTENSION (see next patch)
for what formats can be specified when allocating an IOASID.

Open:
- Devices on PPC platform currently use a different iommu driver in vfio.
Per previous discussion they can also use vfio type1v2 as long as there
is a way to claim a specific iova range from a system-wide address space.
This requirement doesn't sound PPC specific, as addr_width for pci devices
can be also represented by a range [0, 2^addr_width-1]. This RFC hasn't
adopted this design yet. We hope to have formal alignment in v1 discussion
and then decide how to incorporate it in v2.

- Currently ioasid term has already been used in the kernel (drivers/iommu/
ioasid.c) to represent the hardware I/O address space ID in the wire. It
covers both PCI PASID (Process Address Space ID) and ARM SSID (Sub-Stream
ID). We need find a way to resolve the naming conflict between the hardware
ID and software handle. One option is to rename the existing ioasid to be
pasid or ssid, given their full names still sound generic. Appreciate more
thoughts on this open!

Signed-off-by: Liu Yi L <[email protected]>
---
drivers/iommu/iommufd/iommufd.c | 120 ++++++++++++++++++++++++++++++++
include/linux/iommufd.h | 3 +
include/uapi/linux/iommu.h | 54 ++++++++++++++
3 files changed, 177 insertions(+)

diff --git a/drivers/iommu/iommufd/iommufd.c b/drivers/iommu/iommufd/iommufd.c
index 641f199f2d41..4839f128b24a 100644
--- a/drivers/iommu/iommufd/iommufd.c
+++ b/drivers/iommu/iommufd/iommufd.c
@@ -24,6 +24,7 @@
struct iommufd_ctx {
refcount_t refs;
struct mutex lock;
+ struct xarray ioasid_xa; /* xarray of ioasids */
struct xarray device_xa; /* xarray of bound devices */
};

@@ -42,6 +43,16 @@ struct iommufd_device {
u64 dev_cookie;
};

+/* Represent an I/O address space */
+struct iommufd_ioas {
+ int ioasid;
+ u32 type;
+ u32 addr_width;
+ bool enforce_snoop;
+ struct iommufd_ctx *ictx;
+ refcount_t refs;
+};
+
static int iommufd_fops_open(struct inode *inode, struct file *filep)
{
struct iommufd_ctx *ictx;
@@ -53,6 +64,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filep)

refcount_set(&ictx->refs, 1);
mutex_init(&ictx->lock);
+ xa_init_flags(&ictx->ioasid_xa, XA_FLAGS_ALLOC);
xa_init_flags(&ictx->device_xa, XA_FLAGS_ALLOC);
filep->private_data = ictx;

@@ -102,16 +114,118 @@ static void iommufd_ctx_put(struct iommufd_ctx *ictx)
if (!refcount_dec_and_test(&ictx->refs))
return;

+ WARN_ON(!xa_empty(&ictx->ioasid_xa));
WARN_ON(!xa_empty(&ictx->device_xa));
kfree(ictx);
}

+/* Caller should hold ictx->lock */
+static void ioas_put_locked(struct iommufd_ioas *ioas)
+{
+ struct iommufd_ctx *ictx = ioas->ictx;
+ int ioasid = ioas->ioasid;
+
+ if (!refcount_dec_and_test(&ioas->refs))
+ return;
+
+ xa_erase(&ictx->ioasid_xa, ioasid);
+ iommufd_ctx_put(ictx);
+ kfree(ioas);
+}
+
+static int iommufd_ioasid_alloc(struct iommufd_ctx *ictx, unsigned long arg)
+{
+ struct iommu_ioasid_alloc req;
+ struct iommufd_ioas *ioas;
+ unsigned long minsz;
+ int ioasid, ret;
+
+ minsz = offsetofend(struct iommu_ioasid_alloc, addr_width);
+
+ if (copy_from_user(&req, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (req.argsz < minsz || !req.addr_width ||
+ req.flags != IOMMU_IOASID_ENFORCE_SNOOP ||
+ req.type != IOMMU_IOASID_TYPE_KERNEL_TYPE1V2)
+ return -EINVAL;
+
+ ioas = kzalloc(sizeof(*ioas), GFP_KERNEL);
+ if (!ioas)
+ return -ENOMEM;
+
+ mutex_lock(&ictx->lock);
+ ret = xa_alloc(&ictx->ioasid_xa, &ioasid, ioas,
+ XA_LIMIT(IOMMUFD_IOASID_MIN, IOMMUFD_IOASID_MAX),
+ GFP_KERNEL);
+ mutex_unlock(&ictx->lock);
+ if (ret) {
+ pr_err_ratelimited("Failed to alloc ioasid\n");
+ kfree(ioas);
+ return ret;
+ }
+
+ ioas->ioasid = ioasid;
+
+ /* only supports kernel managed I/O page table so far */
+ ioas->type = IOMMU_IOASID_TYPE_KERNEL_TYPE1V2;
+
+ ioas->addr_width = req.addr_width;
+
+ /* only supports enforce snoop today */
+ ioas->enforce_snoop = true;
+
+ iommufd_ctx_get(ictx);
+ ioas->ictx = ictx;
+
+ refcount_set(&ioas->refs, 1);
+
+ return ioasid;
+}
+
+static int iommufd_ioasid_free(struct iommufd_ctx *ictx, unsigned long arg)
+{
+ struct iommufd_ioas *ioas = NULL;
+ int ioasid, ret;
+
+ if (copy_from_user(&ioasid, (void __user *)arg, sizeof(ioasid)))
+ return -EFAULT;
+
+ if (ioasid < 0)
+ return -EINVAL;
+
+ mutex_lock(&ictx->lock);
+ ioas = xa_load(&ictx->ioasid_xa, ioasid);
+ if (IS_ERR(ioas)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Disallow free if refcount is not 1 */
+ if (refcount_read(&ioas->refs) > 1) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ ioas_put_locked(ioas);
+out_unlock:
+ mutex_unlock(&ictx->lock);
+ return ret;
+};
+
static int iommufd_fops_release(struct inode *inode, struct file *filep)
{
struct iommufd_ctx *ictx = filep->private_data;
+ struct iommufd_ioas *ioas;
+ unsigned long index;

filep->private_data = NULL;

+ mutex_lock(&ictx->lock);
+ xa_for_each(&ictx->ioasid_xa, index, ioas)
+ ioas_put_locked(ioas);
+ mutex_unlock(&ictx->lock);
+
iommufd_ctx_put(ictx);

return 0;
@@ -195,6 +309,12 @@ static long iommufd_fops_unl_ioctl(struct file *filep,
case IOMMU_DEVICE_GET_INFO:
ret = iommufd_get_device_info(ictx, arg);
break;
+ case IOMMU_IOASID_ALLOC:
+ ret = iommufd_ioasid_alloc(ictx, arg);
+ break;
+ case IOMMU_IOASID_FREE:
+ ret = iommufd_ioasid_free(ictx, arg);
+ break;
default:
pr_err_ratelimited("unsupported cmd %u\n", cmd);
break;
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 1603a13937e9..1dd6515e7816 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -14,6 +14,9 @@
#include <linux/err.h>
#include <linux/device.h>

+#define IOMMUFD_IOASID_MAX ((unsigned int)(0x7FFFFFFF))
+#define IOMMUFD_IOASID_MIN 0
+
#define IOMMUFD_DEVID_MAX ((unsigned int)(0x7FFFFFFF))
#define IOMMUFD_DEVID_MIN 0

diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
index 76b71f9d6b34..5cbd300eb0ee 100644
--- a/include/uapi/linux/iommu.h
+++ b/include/uapi/linux/iommu.h
@@ -57,6 +57,60 @@ struct iommu_device_info {

#define IOMMU_DEVICE_GET_INFO _IO(IOMMU_TYPE, IOMMU_BASE + 1)

+/*
+ * IOMMU_IOASID_ALLOC - _IOWR(IOMMU_TYPE, IOMMU_BASE + 2,
+ * struct iommu_ioasid_alloc)
+ *
+ * Allocate an IOASID.
+ *
+ * IOASID is the FD-local software handle representing an I/O address
+ * space. Each IOASID is associated with a single I/O page table. User
+ * must call this ioctl to get an IOASID for every I/O address space
+ * that is intended to be tracked by the kernel.
+ *
+ * User needs to specify the attributes of the IOASID and associated
+ * I/O page table format information according to one or multiple devices
+ * which will be attached to this IOASID right after. The I/O page table
+ * is activated in the IOMMU when it's attached by a device. Incompatible
+ * format between device and IOASID will lead to attaching failure in
+ * device side.
+ *
+ * Currently only one flag (IOMMU_IOASID_ENFORCE_SNOOP) is supported and
+ * must be always set.
+ *
+ * Only one I/O page table type (kernel-managed) is supported, with vfio
+ * type1v2 mapping semantics.
+ *
+ * User should call IOMMU_CHECK_EXTENSION for future extensions.
+ *
+ * @argsz: user filled size of this data.
+ * @flags: additional information for IOASID allocation.
+ * @type: I/O address space page table type.
+ * @addr_width: address width of the I/O address space.
+ *
+ * Return: allocated ioasid on success, -errno on failure.
+ */
+struct iommu_ioasid_alloc {
+ __u32 argsz;
+ __u32 flags;
+#define IOMMU_IOASID_ENFORCE_SNOOP (1 << 0)
+ __u32 type;
+#define IOMMU_IOASID_TYPE_KERNEL_TYPE1V2 1
+ __u32 addr_width;
+};
+
+#define IOMMU_IOASID_ALLOC _IO(IOMMU_TYPE, IOMMU_BASE + 2)
+
+/**
+ * IOMMU_IOASID_FREE - _IOWR(IOMMU_TYPE, IOMMU_BASE + 3, int)
+ *
+ * Free an IOASID.
+ *
+ * returns: 0 on success, -errno on failure
+ */
+
+#define IOMMU_IOASID_FREE _IO(IOMMU_TYPE, IOMMU_BASE + 3)
+
#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */
#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */
#define IOMMU_FAULT_PERM_EXEC (1 << 2) /* exec */
--
2.25.1

2021-09-21 17:46:15

by Jason Gunthorpe

[permalink] [raw]

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

2021-09-22 03:41:34

by Tian, Kevin

[permalink] [raw]

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, September 22, 2021 1:45 AM
>
> On Sun, Sep 19, 2021 at 02:38:39PM +0800, Liu Yi L wrote:
> > This patch adds IOASID allocation/free interface per iommufd. When
> > allocating an IOASID, userspace is expected to specify the type and
> > format information for the target I/O page table.
> >
> > This RFC supports only one type (IOMMU_IOASID_TYPE_KERNEL_TYPE1V2),
> > implying a kernel-managed I/O page table with vfio type1v2 mapping
> > semantics. For this type the user should specify the addr_width of
> > the I/O address space and whether the I/O page table is created in
> > an iommu enfore_snoop format. enforce_snoop must be true at this point,
> > as the false setting requires additional contract with KVM on handling
> > WBINVD emulation, which can be added later.
> >
> > Userspace is expected to call IOMMU_CHECK_EXTENSION (see next patch)
> > for what formats can be specified when allocating an IOASID.
> >
> > Open:
> > - Devices on PPC platform currently use a different iommu driver in vfio.
> > Per previous discussion they can also use vfio type1v2 as long as there
> > is a way to claim a specific iova range from a system-wide address space.
> > This requirement doesn't sound PPC specific, as addr_width for pci
> devices
> > can be also represented by a range [0, 2^addr_width-1]. This RFC hasn't
> > adopted this design yet. We hope to have formal alignment in v1
> discussion
> > and then decide how to incorporate it in v2.
>
> I think the request was to include a start/end IO address hint when
> creating the ios. When the kernel creates it then it can return the

is the hint single-range or could be multiple-ranges?

> actual geometry including any holes via a query.

I'd like to see a detail flow from David on how the uAPI works today with
existing spapr driver and what exact changes he'd like to make on this
proposed interface. Above info is still insufficient for us to think about the
right solution.

>
> > - Currently ioasid term has already been used in the kernel
> (drivers/iommu/
> > ioasid.c) to represent the hardware I/O address space ID in the wire. It
> > covers both PCI PASID (Process Address Space ID) and ARM SSID (Sub-
> Stream
> > ID). We need find a way to resolve the naming conflict between the
> hardware
> > ID and software handle. One option is to rename the existing ioasid to be
> > pasid or ssid, given their full names still sound generic. Appreciate more
> > thoughts on this open!
>
> ioas works well here I think. Use ioas_id to refer to the xarray
> index.

What about when introducing pasid to this uAPI? Then use ioas_id
for the xarray index and ioasid to represent pasid/ssid? At this point
the software handle and hardware id are mixed together thus need
a clear terminology to differentiate them.

Thanks
Kevin

2021-09-22 12:55:36

by Yi Liu

[permalink] [raw]

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, September 22, 2021 1:45 AM
>
[...]
> > diff --git a/drivers/iommu/iommufd/iommufd.c
> b/drivers/iommu/iommufd/iommufd.c
> > index 641f199f2d41..4839f128b24a 100644
> > +++ b/drivers/iommu/iommufd/iommufd.c
> > @@ -24,6 +24,7 @@
> > struct iommufd_ctx {
> > refcount_t refs;
> > struct mutex lock;
> > + struct xarray ioasid_xa; /* xarray of ioasids */
> > struct xarray device_xa; /* xarray of bound devices */
> > };
> >
> > @@ -42,6 +43,16 @@ struct iommufd_device {
> > u64 dev_cookie;
> > };
> >
> > +/* Represent an I/O address space */
> > +struct iommufd_ioas {
> > + int ioasid;
>
> xarray id's should consistently be u32s everywhere.

sure. just one more check, this id is supposed to be returned to
userspace as the return value of ioctl(IOASID_ALLOC). That's why
I chose to use "int" as its prototype to make it aligned with the
return type of ioctl(). Based on this, do you think it's still better
to use "u32" here?

Regards,
Yi Liu

> Many of the same prior comments repeated here
>
> Jason

2021-09-22 13:36:39

by Jason Gunthorpe

[permalink] [raw]

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

On Wed, Sep 22, 2021 at 12:51:38PM +0000, Liu, Yi L wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Wednesday, September 22, 2021 1:45 AM
> >
> [...]
> > > diff --git a/drivers/iommu/iommufd/iommufd.c
> > b/drivers/iommu/iommufd/iommufd.c
> > > index 641f199f2d41..4839f128b24a 100644
> > > +++ b/drivers/iommu/iommufd/iommufd.c
> > > @@ -24,6 +24,7 @@
> > > struct iommufd_ctx {
> > > refcount_t refs;
> > > struct mutex lock;
> > > + struct xarray ioasid_xa; /* xarray of ioasids */
> > > struct xarray device_xa; /* xarray of bound devices */
> > > };
> > >
> > > @@ -42,6 +43,16 @@ struct iommufd_device {
> > > u64 dev_cookie;
> > > };
> > >
> > > +/* Represent an I/O address space */
> > > +struct iommufd_ioas {
> > > + int ioasid;
> >
> > xarray id's should consistently be u32s everywhere.
>
> sure. just one more check, this id is supposed to be returned to
> userspace as the return value of ioctl(IOASID_ALLOC). That's why
> I chose to use "int" as its prototype to make it aligned with the
> return type of ioctl(). Based on this, do you think it's still better
> to use "u32" here?

I suggest not using the return code from ioctl to exchange data.. The
rest of the uAPI uses an in/out struct, everything should do
that consistently.

Jason

2021-09-22 13:47:36

by Jean-Philippe Brucker

[permalink] [raw]

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

2021-09-22 14:14:17

by Jason Gunthorpe

[permalink] [raw]

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

On Wed, Sep 22, 2021 at 03:40:25AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Wednesday, September 22, 2021 1:45 AM
> >
> > On Sun, Sep 19, 2021 at 02:38:39PM +0800, Liu Yi L wrote:
> > > This patch adds IOASID allocation/free interface per iommufd. When
> > > allocating an IOASID, userspace is expected to specify the type and
> > > format information for the target I/O page table.
> > >
> > > This RFC supports only one type (IOMMU_IOASID_TYPE_KERNEL_TYPE1V2),
> > > implying a kernel-managed I/O page table with vfio type1v2 mapping
> > > semantics. For this type the user should specify the addr_width of
> > > the I/O address space and whether the I/O page table is created in
> > > an iommu enfore_snoop format. enforce_snoop must be true at this point,
> > > as the false setting requires additional contract with KVM on handling
> > > WBINVD emulation, which can be added later.
> > >
> > > Userspace is expected to call IOMMU_CHECK_EXTENSION (see next patch)
> > > for what formats can be specified when allocating an IOASID.
> > >
> > > Open:
> > > - Devices on PPC platform currently use a different iommu driver in vfio.
> > > Per previous discussion they can also use vfio type1v2 as long as there
> > > is a way to claim a specific iova range from a system-wide address space.
> > > This requirement doesn't sound PPC specific, as addr_width for pci
> > devices
> > > can be also represented by a range [0, 2^addr_width-1]. This RFC hasn't
> > > adopted this design yet. We hope to have formal alignment in v1
> > discussion
> > > and then decide how to incorporate it in v2.
> >
> > I think the request was to include a start/end IO address hint when
> > creating the ios. When the kernel creates it then it can return the
>
> is the hint single-range or could be multiple-ranges?

David explained it here:

https://lore.kernel.org/kvm/YMrKksUeNW%2FPEGPM@yekko/

qeumu needs to be able to chooose if it gets the 32 bit range or 64
bit range.

So a 'range hint' will do the job

David also suggested this:

https://lore.kernel.org/kvm/YL6%2FbjHyuHJTn4Rd@yekko/

So I like this better:

struct iommu_ioasid_alloc {
__u32 argsz;

__u32 flags;
#define IOMMU_IOASID_ENFORCE_SNOOP (1 << 0)
#define IOMMU_IOASID_HINT_BASE_IOVA (1 << 1)

__aligned_u64 max_iova_hint;
__aligned_u64 base_iova_hint; // Used only if IOMMU_IOASID_HINT_BASE_IOVA

// For creating nested page tables
__u32 parent_ios_id;
__u32 format;
#define IOMMU_FORMAT_KERNEL 0
#define IOMMU_FORMAT_PPC_XXX 2
#define IOMMU_FORMAT_[..]
u32 format_flags; // Layout depends on format above

__aligned_u64 user_page_directory; // Used if parent_ios_id != 0
};

Again 'type' as an overall API indicator should not exist, feature
flags need to have clear narrow meanings.

This does both of David's suggestions at once. If quemu wants the 1G
limited region it could specify max_iova_hint = 1G, if it wants the
extend 64bit region with the hole it can give either the high base or
a large max_iova_hint. format/format_flags allows a further
device-specific escape if more specific customization is needed and is
needed to specify user space page tables anyhow.

> > ioas works well here I think. Use ioas_id to refer to the xarray
> > index.
>
> What about when introducing pasid to this uAPI? Then use ioas_id
> for the xarray index

Yes, ioas_id should always be the xarray index.

PASID needs to be called out as PASID or as a generic "hw description"
blob.

kvm's API to program the vPASID translation table should probably take
in a (iommufd,ioas_id,device_id) tuple and extract the IOMMU side
information using an in-kernel API. Userspace shouldn't have to
shuttle it around.

I'm starting to feel like the struct approach for describing this uAPI
might not scale well, but lets see..

Jason

2021-09-23 06:30:24

by Yi Liu

[permalink] [raw]

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, September 22, 2021 9:32 PM
>
> On Wed, Sep 22, 2021 at 12:51:38PM +0000, Liu, Yi L wrote:
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Wednesday, September 22, 2021 1:45 AM
> > >
> > [...]
> > > > diff --git a/drivers/iommu/iommufd/iommufd.c
> > > b/drivers/iommu/iommufd/iommufd.c
> > > > index 641f199f2d41..4839f128b24a 100644
> > > > +++ b/drivers/iommu/iommufd/iommufd.c
> > > > @@ -24,6 +24,7 @@
> > > > struct iommufd_ctx {
> > > > refcount_t refs;
> > > > struct mutex lock;
> > > > + struct xarray ioasid_xa; /* xarray of ioasids */
> > > > struct xarray device_xa; /* xarray of bound devices */
> > > > };
> > > >
> > > > @@ -42,6 +43,16 @@ struct iommufd_device {
> > > > u64 dev_cookie;
> > > > };
> > > >
> > > > +/* Represent an I/O address space */
> > > > +struct iommufd_ioas {
> > > > + int ioasid;
> > >
> > > xarray id's should consistently be u32s everywhere.
> >
> > sure. just one more check, this id is supposed to be returned to
> > userspace as the return value of ioctl(IOASID_ALLOC). That's why
> > I chose to use "int" as its prototype to make it aligned with the
> > return type of ioctl(). Based on this, do you think it's still better
> > to use "u32" here?
>
> I suggest not using the return code from ioctl to exchange data.. The
> rest of the uAPI uses an in/out struct, everything should do
> that consistently.

got it.

Thanks,
Yi Liu

2021-09-23 09:16:33

by Tian, Kevin

[permalink] [raw]

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, September 22, 2021 10:09 PM
>
> On Wed, Sep 22, 2021 at 03:40:25AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Wednesday, September 22, 2021 1:45 AM
> > >
> > > On Sun, Sep 19, 2021 at 02:38:39PM +0800, Liu Yi L wrote:
> > > > This patch adds IOASID allocation/free interface per iommufd. When
> > > > allocating an IOASID, userspace is expected to specify the type and
> > > > format information for the target I/O page table.
> > > >
> > > > This RFC supports only one type
> (IOMMU_IOASID_TYPE_KERNEL_TYPE1V2),
> > > > implying a kernel-managed I/O page table with vfio type1v2 mapping
> > > > semantics. For this type the user should specify the addr_width of
> > > > the I/O address space and whether the I/O page table is created in
> > > > an iommu enfore_snoop format. enforce_snoop must be true at this
> point,
> > > > as the false setting requires additional contract with KVM on handling
> > > > WBINVD emulation, which can be added later.
> > > >
> > > > Userspace is expected to call IOMMU_CHECK_EXTENSION (see next
> patch)
> > > > for what formats can be specified when allocating an IOASID.
> > > >
> > > > Open:
> > > > - Devices on PPC platform currently use a different iommu driver in vfio.
> > > > Per previous discussion they can also use vfio type1v2 as long as there
> > > > is a way to claim a specific iova range from a system-wide address
> space.
> > > > This requirement doesn't sound PPC specific, as addr_width for pci
> > > devices
> > > > can be also represented by a range [0, 2^addr_width-1]. This RFC
> hasn't
> > > > adopted this design yet. We hope to have formal alignment in v1
> > > discussion
> > > > and then decide how to incorporate it in v2.
> > >
> > > I think the request was to include a start/end IO address hint when
> > > creating the ios. When the kernel creates it then it can return the
> >
> > is the hint single-range or could be multiple-ranges?
>
> David explained it here:
>
> https://lore.kernel.org/kvm/YMrKksUeNW%2FPEGPM@yekko/
>
> qeumu needs to be able to chooose if it gets the 32 bit range or 64
> bit range.
>
> So a 'range hint' will do the job
>
> David also suggested this:
>
> https://lore.kernel.org/kvm/YL6%2FbjHyuHJTn4Rd@yekko/
>
> So I like this better:
>
> struct iommu_ioasid_alloc {
> __u32 argsz;
>
> __u32 flags;
> #define IOMMU_IOASID_ENFORCE_SNOOP (1 << 0)
> #define IOMMU_IOASID_HINT_BASE_IOVA (1 << 1)
>
> __aligned_u64 max_iova_hint;
> __aligned_u64 base_iova_hint; // Used only if
> IOMMU_IOASID_HINT_BASE_IOVA
>
> // For creating nested page tables
> __u32 parent_ios_id;
> __u32 format;
> #define IOMMU_FORMAT_KERNEL 0
> #define IOMMU_FORMAT_PPC_XXX 2
> #define IOMMU_FORMAT_[..]
> u32 format_flags; // Layout depends on format above
>
> __aligned_u64 user_page_directory; // Used if parent_ios_id != 0
> };
>
> Again 'type' as an overall API indicator should not exist, feature
> flags need to have clear narrow meanings.

currently the type is aimed to differentiate three usages:

- kernel-managed I/O page table
- user-managed I/O page table
- shared I/O page table (e.g. with mm, or ept)

we can remove 'type', but is FORMAT_KENREL/USER/SHARED a good
indicator? their difference is not about format.

>
> This does both of David's suggestions at once. If quemu wants the 1G
> limited region it could specify max_iova_hint = 1G, if it wants the
> extend 64bit region with the hole it can give either the high base or
> a large max_iova_hint. format/format_flags allows a further

Dave's links didn't answer one puzzle from me. Does PPC needs accurate
range information or be ok with a large range including holes (then let
the kernel to figure out where the holes locate)?

> device-specific escape if more specific customization is needed and is
> needed to specify user space page tables anyhow.

and I didn't understand the 2nd link. How does user-managed page
table jump into this range claim problem? I'm getting confused...

>
> > > ioas works well here I think. Use ioas_id to refer to the xarray
> > > index.
> >
> > What about when introducing pasid to this uAPI? Then use ioas_id
> > for the xarray index
>
> Yes, ioas_id should always be the xarray index.
>
> PASID needs to be called out as PASID or as a generic "hw description"
> blob.

ARM doesn't use PASID. So we need a generic blob, e.g. ioas_hwid?

and still we have both ioas_id (iommufd) and ioasid (ioasid.c) in the
kernel. Do we want to clear this confusion? Or possibly it's fine because
ioas_id is never used outside of iommufd and iommufd doesn't directly
call ioasid_alloc() from ioasid.c?

>
> kvm's API to program the vPASID translation table should probably take
> in a (iommufd,ioas_id,device_id) tuple and extract the IOMMU side
> information using an in-kernel API. Userspace shouldn't have to
> shuttle it around.

the vPASID info is carried in VFIO_DEVICE_ATTACH_IOASID uAPI.
when kvm calls iommufd with above tuple, vPASID->pPASID is
returned to kvm. So we still need a generic blob to represent
vPASID in the uAPI.

>
> I'm starting to feel like the struct approach for describing this uAPI
> might not scale well, but lets see..
>
> Jason

2021-09-23 12:11:08

by Jason Gunthorpe

[permalink] [raw]

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

On Thu, Sep 23, 2021 at 09:14:58AM +0000, Tian, Kevin wrote:

> currently the type is aimed to differentiate three usages:
>
> - kernel-managed I/O page table
> - user-managed I/O page table
> - shared I/O page table (e.g. with mm, or ept)

Creating a shared ios is something that should probably be a different
command.

> we can remove 'type', but is FORMAT_KENREL/USER/SHARED a good
> indicator? their difference is not about format.

Format should be

FORMAT_KERNEL/FORMAT_INTEL_PTE_V1/FORMAT_INTEL_PTE_V2/etc

> Dave's links didn't answer one puzzle from me. Does PPC needs accurate
> range information or be ok with a large range including holes (then let
> the kernel to figure out where the holes locate)?

My impression was it only needed a way to select between the two
different cases as they are exclusive. I'd see this API as being a
hint and userspace should query the exact ranges to learn what was
actually created.

> > device-specific escape if more specific customization is needed and is
> > needed to specify user space page tables anyhow.
>
> and I didn't understand the 2nd link. How does user-managed page
> table jump into this range claim problem? I'm getting confused...

PPC could also model it using a FORMAT_KERNEL_PPC_X, FORMAT_KERNEL_PPC_Y
though it is less nice..

> > Yes, ioas_id should always be the xarray index.
> >
> > PASID needs to be called out as PASID or as a generic "hw description"
> > blob.
>
> ARM doesn't use PASID. So we need a generic blob, e.g. ioas_hwid?

ARM *does* need PASID! PASID is the label of the DMA on the PCI bus,
and it MUST be exposed in that format to be programmed into the PCI
device itself.

All of this should be able to support a userspace, like DPDK, creating
a PASID on its own without any special VFIO drivers.

- Open iommufd
- Attach the vfio device FD
- Request a PASID device id
- Create an ios against the pasid device id
- Query the ios for the PCI PASID #
- Program the HW to issue TLPs with the PASID

> and still we have both ioas_id (iommufd) and ioasid (ioasid.c) in the
> kernel. Do we want to clear this confusion? Or possibly it's fine because
> ioas_id is never used outside of iommufd and iommufd doesn't directly
> call ioasid_alloc() from ioasid.c?

As long as it is ioas_id and ioasid it is probably fine..

> > kvm's API to program the vPASID translation table should probably take
> > in a (iommufd,ioas_id,device_id) tuple and extract the IOMMU side
> > information using an in-kernel API. Userspace shouldn't have to
> > shuttle it around.
>
> the vPASID info is carried in VFIO_DEVICE_ATTACH_IOASID uAPI.
> when kvm calls iommufd with above tuple, vPASID->pPASID is
> returned to kvm. So we still need a generic blob to represent
> vPASID in the uAPI.

I think you have to be clear about what the value is being used
for. Is it an IOMMU page table handle or is it a PCI PASID value?

AFAICT I think it is the former in the Intel scheme as the "vPASID" is
really about presenting a consistent IOMMU handle to the guest across
migration, it is not the value that shows up on the PCI bus.

Jason

2021-09-23 12:24:52

by Tian, Kevin

[permalink] [raw]

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

> From: Jason Gunthorpe <[email protected]>
> Sent: Thursday, September 23, 2021 8:07 PM
>
> On Thu, Sep 23, 2021 at 09:14:58AM +0000, Tian, Kevin wrote:
>
> > currently the type is aimed to differentiate three usages:
> >
> > - kernel-managed I/O page table
> > - user-managed I/O page table
> > - shared I/O page table (e.g. with mm, or ept)
>
> Creating a shared ios is something that should probably be a different
> command.

why? I didn't understand the criteria here...

>
> > we can remove 'type', but is FORMAT_KENREL/USER/SHARED a good
> > indicator? their difference is not about format.
>
> Format should be
>
> FORMAT_KERNEL/FORMAT_INTEL_PTE_V1/FORMAT_INTEL_PTE_V2/etc

INTEL_PTE_V1/V2 are formats. Why is kernel-managed called a format?

>
> > Dave's links didn't answer one puzzle from me. Does PPC needs accurate
> > range information or be ok with a large range including holes (then let
> > the kernel to figure out where the holes locate)?
>
> My impression was it only needed a way to select between the two
> different cases as they are exclusive. I'd see this API as being a
> hint and userspace should query the exact ranges to learn what was
> actually created.

yes, the user can query the permitted range using DEVICE_GET_INFO.
But in the end if the user wants two separate regions, I'm afraid that
the underlying iommu driver wants to know the exact info. iirc PPC
has one global system address space shared by all devices. It is possible
that the user may want to claim range-A and range-C, with range-B
in-between but claimed by another user. Then simply using one hint
range [A-lowend, C-highend] might not work.

>
> > > device-specific escape if more specific customization is needed and is
> > > needed to specify user space page tables anyhow.
> >
> > and I didn't understand the 2nd link. How does user-managed page
> > table jump into this range claim problem? I'm getting confused...
>
> PPC could also model it using a FORMAT_KERNEL_PPC_X,
> FORMAT_KERNEL_PPC_Y
> though it is less nice..

yes PPC can use different format, but I didn't understand why it is
related user-managed page table which further requires nesting. sound
disconnected topics here...

>
> > > Yes, ioas_id should always be the xarray index.
> > >
> > > PASID needs to be called out as PASID or as a generic "hw description"
> > > blob.
> >
> > ARM doesn't use PASID. So we need a generic blob, e.g. ioas_hwid?
>
> ARM *does* need PASID! PASID is the label of the DMA on the PCI bus,
> and it MUST be exposed in that format to be programmed into the PCI
> device itself.

In the entire discussion in previous design RFC, I kept an impression that
ARM-equivalent PASID is called SSID. If we can use PASID as a general
term in iommufd context, definitely it's much better!

>
> All of this should be able to support a userspace, like DPDK, creating
> a PASID on its own without any special VFIO drivers.
>
> - Open iommufd
> - Attach the vfio device FD
> - Request a PASID device id
> - Create an ios against the pasid device id
> - Query the ios for the PCI PASID #
> - Program the HW to issue TLPs with the PASID

this all makes me very confused, and completely different from what
we agreed in previous v2 design proposal:

- open iommufd
- create an ioas
- attach vfio device to ioasid, with vPASID info
* vfio converts vPASID to pPASID and then call iommufd_device_attach_ioasid()
* the latter then installs ioas to the IOMMU with RID/PASID

>
> > and still we have both ioas_id (iommufd) and ioasid (ioasid.c) in the
> > kernel. Do we want to clear this confusion? Or possibly it's fine because
> > ioas_id is never used outside of iommufd and iommufd doesn't directly
> > call ioasid_alloc() from ioasid.c?
>
> As long as it is ioas_id and ioasid it is probably fine..

let's align with others in a few hours.

>
> > > kvm's API to program the vPASID translation table should probably take
> > > in a (iommufd,ioas_id,device_id) tuple and extract the IOMMU side
> > > information using an in-kernel API. Userspace shouldn't have to
> > > shuttle it around.
> >
> > the vPASID info is carried in VFIO_DEVICE_ATTACH_IOASID uAPI.
> > when kvm calls iommufd with above tuple, vPASID->pPASID is
> > returned to kvm. So we still need a generic blob to represent
> > vPASID in the uAPI.
>
> I think you have to be clear about what the value is being used
> for. Is it an IOMMU page table handle or is it a PCI PASID value?
>
> AFAICT I think it is the former in the Intel scheme as the "vPASID" is
> really about presenting a consistent IOMMU handle to the guest across
> migration, it is not the value that shows up on the PCI bus.
>

It's the former. But vfio driver needs to maintain vPASID->pPASID
translation in the mediation path, since what guest programs is vPASID.

Thanks
Kevin

2021-09-23 12:35:14

by Jason Gunthorpe

[permalink] [raw]

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

On Thu, Sep 23, 2021 at 12:22:23PM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Thursday, September 23, 2021 8:07 PM
> >
> > On Thu, Sep 23, 2021 at 09:14:58AM +0000, Tian, Kevin wrote:
> >
> > > currently the type is aimed to differentiate three usages:
> > >
> > > - kernel-managed I/O page table
> > > - user-managed I/O page table
> > > - shared I/O page table (e.g. with mm, or ept)
> >
> > Creating a shared ios is something that should probably be a different
> > command.
>
> why? I didn't understand the criteria here...

I suspect the input args will be very different, no?

> > > we can remove 'type', but is FORMAT_KENREL/USER/SHARED a good
> > > indicator? their difference is not about format.
> >
> > Format should be
> >
> > FORMAT_KERNEL/FORMAT_INTEL_PTE_V1/FORMAT_INTEL_PTE_V2/etc
>
> INTEL_PTE_V1/V2 are formats. Why is kernel-managed called a format?

So long as we are using structs we need to have values then the field
isn't being used. FORMAT_KERNEL is a reasonable value to have when we
are not creating a userspace page table.

Alternatively a userspace page table could have a different API

> yes, the user can query the permitted range using DEVICE_GET_INFO.
> But in the end if the user wants two separate regions, I'm afraid that
> the underlying iommu driver wants to know the exact info. iirc PPC
> has one global system address space shared by all devices. It is possible
> that the user may want to claim range-A and range-C, with range-B
> in-between but claimed by another user. Then simply using one hint
> range [A-lowend, C-highend] might not work.

I don't know, that sounds strange.. In any event hint is a hint, it
can be ignored, the only information the kernel needs to extract is
low/high bank?

> yes PPC can use different format, but I didn't understand why it is
> related user-managed page table which further requires nesting. sound
> disconnected topics here...

It is just a way to feed through more information if we get stuck
someday.

> > ARM *does* need PASID! PASID is the label of the DMA on the PCI bus,
> > and it MUST be exposed in that format to be programmed into the PCI
> > device itself.
>
> In the entire discussion in previous design RFC, I kept an impression that
> ARM-equivalent PASID is called SSID. If we can use PASID as a general
> term in iommufd context, definitely it's much better!

SSID is inside the chip and part of the IOMMU. PASID is part of the
PCI spec.

iommufd should keep these things distinct.

If we are talking about a PCI TLP then the name to use is PASID.

> > All of this should be able to support a userspace, like DPDK, creating
> > a PASID on its own without any special VFIO drivers.
> >
> > - Open iommufd
> > - Attach the vfio device FD
> > - Request a PASID device id
> > - Create an ios against the pasid device id
> > - Query the ios for the PCI PASID #
> > - Program the HW to issue TLPs with the PASID
>
> this all makes me very confused, and completely different from what
> we agreed in previous v2 design proposal:
>
> - open iommufd
> - create an ioas
> - attach vfio device to ioasid, with vPASID info
> * vfio converts vPASID to pPASID and then call iommufd_device_attach_ioasid()
> * the latter then installs ioas to the IOMMU with RID/PASID

This was your flow for mdev's, I've always been talking about wanting
to see this supported for all use cases, including physical PCI
devices w/ PASID support.

A normal vfio_pci userspace should be able to create PASIDs unrelated
to the mdev stuff.

> > AFAICT I think it is the former in the Intel scheme as the "vPASID" is
> > really about presenting a consistent IOMMU handle to the guest across
> > migration, it is not the value that shows up on the PCI bus.
>
> It's the former. But vfio driver needs to maintain vPASID->pPASID
> translation in the mediation path, since what guest programs is vPASID.

The pPASID definately is a PASID as it goes out on the PCIe wire

Suggest you come up with a more general name for vPASID?

Jason

2021-09-23 12:47:58

by Tian, Kevin

[permalink] [raw]

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

> From: Jason Gunthorpe <[email protected]>
> Sent: Thursday, September 23, 2021 8:31 PM
>
> On Thu, Sep 23, 2021 at 12:22:23PM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Thursday, September 23, 2021 8:07 PM
> > >
> > > On Thu, Sep 23, 2021 at 09:14:58AM +0000, Tian, Kevin wrote:
> > >
> > > > currently the type is aimed to differentiate three usages:
> > > >
> > > > - kernel-managed I/O page table
> > > > - user-managed I/O page table
> > > > - shared I/O page table (e.g. with mm, or ept)
> > >
> > > Creating a shared ios is something that should probably be a different
> > > command.
> >
> > why? I didn't understand the criteria here...
>
> I suspect the input args will be very different, no?

yes, but can't the structure be extended to incorporate it?

>
> > > > we can remove 'type', but is FORMAT_KENREL/USER/SHARED a good
> > > > indicator? their difference is not about format.
> > >
> > > Format should be
> > >
> > > FORMAT_KERNEL/FORMAT_INTEL_PTE_V1/FORMAT_INTEL_PTE_V2/etc
> >
> > INTEL_PTE_V1/V2 are formats. Why is kernel-managed called a format?
>
> So long as we are using structs we need to have values then the field
> isn't being used. FORMAT_KERNEL is a reasonable value to have when we
> are not creating a userspace page table.
>
> Alternatively a userspace page table could have a different API

I don't know. Your comments really confused me on what's the right
way to design the uAPI. If you still remember, the original v1 proposal
introduced different uAPIs for kernel/user-managed cases. Then you
recommended to consolidate everything related to ioas in one allocation
command.

Can you help articulate the criteria first?

>
> > yes, the user can query the permitted range using DEVICE_GET_INFO.
> > But in the end if the user wants two separate regions, I'm afraid that
> > the underlying iommu driver wants to know the exact info. iirc PPC
> > has one global system address space shared by all devices. It is possible
> > that the user may want to claim range-A and range-C, with range-B
> > in-between but claimed by another user. Then simply using one hint
> > range [A-lowend, C-highend] might not work.
>
> I don't know, that sounds strange.. In any event hint is a hint, it
> can be ignored, the only information the kernel needs to extract is
> low/high bank?

iirc Dave said that the user needs to claim a range explicitly. 'claim'
sounds not a hint to me. Possibly it's time for Dave to chime in.

>
> > yes PPC can use different format, but I didn't understand why it is
> > related user-managed page table which further requires nesting. sound
> > disconnected topics here...
>
> It is just a way to feed through more information if we get stuck
> someday.

You mean that we should define uAPI for all future possible extensions
now to minimize the frequency of changing it?

>
> > > ARM *does* need PASID! PASID is the label of the DMA on the PCI bus,
> > > and it MUST be exposed in that format to be programmed into the PCI
> > > device itself.
> >
> > In the entire discussion in previous design RFC, I kept an impression that
> > ARM-equivalent PASID is called SSID. If we can use PASID as a general
> > term in iommufd context, definitely it's much better!
>
> SSID is inside the chip and part of the IOMMU. PASID is part of the
> PCI spec.
>
> iommufd should keep these things distinct.
>
> If we are talking about a PCI TLP then the name to use is PASID.

If Jean doesn't object...

>
> > > All of this should be able to support a userspace, like DPDK, creating
> > > a PASID on its own without any special VFIO drivers.
> > >
> > > - Open iommufd
> > > - Attach the vfio device FD
> > > - Request a PASID device id
> > > - Create an ios against the pasid device id
> > > - Query the ios for the PCI PASID #
> > > - Program the HW to issue TLPs with the PASID
> >
> > this all makes me very confused, and completely different from what
> > we agreed in previous v2 design proposal:
> >
> > - open iommufd
> > - create an ioas
> > - attach vfio device to ioasid, with vPASID info
> > * vfio converts vPASID to pPASID and then call
> iommufd_device_attach_ioasid()
> > * the latter then installs ioas to the IOMMU with RID/PASID
>
> This was your flow for mdev's, I've always been talking about wanting
> to see this supported for all use cases, including physical PCI
> devices w/ PASID support.

this is not a flow for mdev. It's also required for pdev on Intel platform,
because the pasid table is in HPA space thus must be managed by host
kernel. Even no translation we still need the user to provide the pasid info.

>
> A normal vfio_pci userspace should be able to create PASIDs unrelated
> to the mdev stuff.
>
> > > AFAICT I think it is the former in the Intel scheme as the "vPASID" is
> > > really about presenting a consistent IOMMU handle to the guest across
> > > migration, it is not the value that shows up on the PCI bus.
> >
> > It's the former. But vfio driver needs to maintain vPASID->pPASID
> > translation in the mediation path, since what guest programs is vPASID.
>
> The pPASID definately is a PASID as it goes out on the PCIe wire
>
> Suggest you come up with a more general name for vPASID?
>

as explained earlier, on Intel platform the user always needs to provide
a PASID in the attaching call. whether it's directly used (for pdev)
or translated (for mdev) is the underlying driver thing. From kernel
p.o.v, since this PASID is provided by the user, it's fine to call it vPASID
in the uAPI.

Thanks
Kevin

2021-09-23 13:04:55

by Jason Gunthorpe

[permalink] [raw]

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

On Thu, Sep 23, 2021 at 12:45:17PM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Thursday, September 23, 2021 8:31 PM
> >
> > On Thu, Sep 23, 2021 at 12:22:23PM +0000, Tian, Kevin wrote:
> > > > From: Jason Gunthorpe <[email protected]>
> > > > Sent: Thursday, September 23, 2021 8:07 PM
> > > >
> > > > On Thu, Sep 23, 2021 at 09:14:58AM +0000, Tian, Kevin wrote:
> > > >
> > > > > currently the type is aimed to differentiate three usages:
> > > > >
> > > > > - kernel-managed I/O page table
> > > > > - user-managed I/O page table
> > > > > - shared I/O page table (e.g. with mm, or ept)
> > > >
> > > > Creating a shared ios is something that should probably be a different
> > > > command.
> > >
> > > why? I didn't understand the criteria here...
> >
> > I suspect the input args will be very different, no?
>
> yes, but can't the structure be extended to incorporate it?

You need to be thoughtful, giant structures with endless combinations
of optional fields turn out very hard. I haven't even seen what args
this shared thing will need, but I'm guessing it is almost none, so
maybe a new call is OK?

If it is literally just 'give me an ioas for current mm' then it has
no args or complexity at all.

> > > > > we can remove 'type', but is FORMAT_KENREL/USER/SHARED a good
> > > > > indicator? their difference is not about format.
> > > >
> > > > Format should be
> > > >
> > > > FORMAT_KERNEL/FORMAT_INTEL_PTE_V1/FORMAT_INTEL_PTE_V2/etc
> > >
> > > INTEL_PTE_V1/V2 are formats. Why is kernel-managed called a format?
> >
> > So long as we are using structs we need to have values then the field
> > isn't being used. FORMAT_KERNEL is a reasonable value to have when we
> > are not creating a userspace page table.
> >
> > Alternatively a userspace page table could have a different API
>
> I don't know. Your comments really confused me on what's the right
> way to design the uAPI. If you still remember, the original v1 proposal
> introduced different uAPIs for kernel/user-managed cases. Then you
> recommended to consolidate everything related to ioas in one allocation
> command.

This is because you had almost completely duplicated the input args
between the two calls.

If it turns out they have very different args, then they should have
different calls.

> > > - open iommufd
> > > - create an ioas
> > > - attach vfio device to ioasid, with vPASID info
> > > * vfio converts vPASID to pPASID and then call
> > iommufd_device_attach_ioasid()
> > > * the latter then installs ioas to the IOMMU with RID/PASID
> >
> > This was your flow for mdev's, I've always been talking about wanting
> > to see this supported for all use cases, including physical PCI
> > devices w/ PASID support.
>
> this is not a flow for mdev. It's also required for pdev on Intel platform,
> because the pasid table is in HPA space thus must be managed by host
> kernel. Even no translation we still need the user to provide the pasid info.

There should be no mandatory vPASID stuff in most of these flows, that
is just a special thing ENQCMD virtualization needs. If userspace
isn't doing ENQCMD virtualization it shouldn't need to touch this
stuff.

> as explained earlier, on Intel platform the user always needs to provide
> a PASID in the attaching call. whether it's directly used (for pdev)
> or translated (for mdev) is the underlying driver thing. From kernel
> p.o.v, since this PASID is provided by the user, it's fine to call it vPASID
> in the uAPI.

I've always disagreed with this. There should be an option for the
kernel to pick an appropriate PASID for portability to other IOMMUs
and simplicity of the interface.

You need to keep it clear what is in the minimum basic path and what
is needed for special cases, like ENQCMD virtualization.

Not every user of iommufd is doing virtualization.

Jason

2021-09-23 13:22:49

by Tian, Kevin

[permalink] [raw]

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

> From: Jason Gunthorpe <[email protected]>
> Sent: Thursday, September 23, 2021 9:02 PM
>
> On Thu, Sep 23, 2021 at 12:45:17PM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Thursday, September 23, 2021 8:31 PM
> > >
> > > On Thu, Sep 23, 2021 at 12:22:23PM +0000, Tian, Kevin wrote:
> > > > > From: Jason Gunthorpe <[email protected]>
> > > > > Sent: Thursday, September 23, 2021 8:07 PM
> > > > >
> > > > > On Thu, Sep 23, 2021 at 09:14:58AM +0000, Tian, Kevin wrote:
> > > > >
> > > > > > currently the type is aimed to differentiate three usages:
> > > > > >
> > > > > > - kernel-managed I/O page table
> > > > > > - user-managed I/O page table
> > > > > > - shared I/O page table (e.g. with mm, or ept)
> > > > >
> > > > > Creating a shared ios is something that should probably be a different
> > > > > command.
> > > >
> > > > why? I didn't understand the criteria here...
> > >
> > > I suspect the input args will be very different, no?
> >
> > yes, but can't the structure be extended to incorporate it?
>
> You need to be thoughtful, giant structures with endless combinations
> of optional fields turn out very hard. I haven't even seen what args
> this shared thing will need, but I'm guessing it is almost none, so
> maybe a new call is OK?

To judge this looks we may have to do some practice on this front
e.g. coming up an example structure for future intended usages and
then see whether one structure can fit?

>
> If it is literally just 'give me an ioas for current mm' then it has
> no args or complexity at all.

for mm, yes, should be simple. for ept it might be more complex e.g.
requiring a handle in kvm and some other format info to match ept
page table.

>
> > > > > > we can remove 'type', but is FORMAT_KENREL/USER/SHARED a good
> > > > > > indicator? their difference is not about format.
> > > > >
> > > > > Format should be
> > > > >
> > > > >
> FORMAT_KERNEL/FORMAT_INTEL_PTE_V1/FORMAT_INTEL_PTE_V2/etc
> > > >
> > > > INTEL_PTE_V1/V2 are formats. Why is kernel-managed called a format?
> > >
> > > So long as we are using structs we need to have values then the field
> > > isn't being used. FORMAT_KERNEL is a reasonable value to have when we
> > > are not creating a userspace page table.
> > >
> > > Alternatively a userspace page table could have a different API
> >
> > I don't know. Your comments really confused me on what's the right
> > way to design the uAPI. If you still remember, the original v1 proposal
> > introduced different uAPIs for kernel/user-managed cases. Then you
> > recommended to consolidate everything related to ioas in one allocation
> > command.
>
> This is because you had almost completely duplicated the input args
> between the two calls.
>
> If it turns out they have very different args, then they should have
> different calls.
>
> > > > - open iommufd
> > > > - create an ioas
> > > > - attach vfio device to ioasid, with vPASID info
> > > > * vfio converts vPASID to pPASID and then call
> > > iommufd_device_attach_ioasid()
> > > > * the latter then installs ioas to the IOMMU with RID/PASID
> > >
> > > This was your flow for mdev's, I've always been talking about wanting
> > > to see this supported for all use cases, including physical PCI
> > > devices w/ PASID support.
> >
> > this is not a flow for mdev. It's also required for pdev on Intel platform,
> > because the pasid table is in HPA space thus must be managed by host
> > kernel. Even no translation we still need the user to provide the pasid info.
>
> There should be no mandatory vPASID stuff in most of these flows, that
> is just a special thing ENQCMD virtualization needs. If userspace
> isn't doing ENQCMD virtualization it shouldn't need to touch this
> stuff.

No. for one, we also support SVA w/o using ENQCMD. For two, the key
is that the PASID table cannot be delegated to the userspace like ARM
or AMD. This implies that for any pasid that the userspace wants to
enable, it must be configured via the kernel.

>
> > as explained earlier, on Intel platform the user always needs to provide
> > a PASID in the attaching call. whether it's directly used (for pdev)
> > or translated (for mdev) is the underlying driver thing. From kernel
> > p.o.v, since this PASID is provided by the user, it's fine to call it vPASID
> > in the uAPI.
>
> I've always disagreed with this. There should be an option for the
> kernel to pick an appropriate PASID for portability to other IOMMUs
> and simplicity of the interface.
>
> You need to keep it clear what is in the minimum basic path and what
> is needed for special cases, like ENQCMD virtualization.
>
> Not every user of iommufd is doing virtualization.
>

just for a short summary of PASID model from previous design RFC:

for arm/amd:
- pasid space delegated to userspace
- pasid table delegated to userspace
- just one call to bind pasid_table() then pasids are fully managed by user

for intel:
- pasid table is always managed by kernel
- for pdev,
- pasid space is delegated to userspace
- attach_ioasid(dev, ioasid, pasid) so the kernel can setup the pasid entry
- for mdev,
- pasid space is managed by userspace
- attach_ioasid(dev, ioasid, vpasid). vfio converts vpasid to ppasid. iommufd setups the ppasid entry
- additional a contract to kvm for setup CPU pasid translation if enqcmd is used
- to unify pdev/mdev, just always call it vpasid in attach_ioasid(). let underlying driver to figure out whether vpasid should be translated.

Thanks
Kevin

2021-09-23 13:35:51

by Jason Gunthorpe

[permalink] [raw]

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

On Thu, Sep 23, 2021 at 01:20:55PM +0000, Tian, Kevin wrote:

> > > this is not a flow for mdev. It's also required for pdev on Intel platform,
> > > because the pasid table is in HPA space thus must be managed by host
> > > kernel. Even no translation we still need the user to provide the pasid info.
> >
> > There should be no mandatory vPASID stuff in most of these flows, that
> > is just a special thing ENQCMD virtualization needs. If userspace
> > isn't doing ENQCMD virtualization it shouldn't need to touch this
> > stuff.
>
> No. for one, we also support SVA w/o using ENQCMD. For two, the key
> is that the PASID table cannot be delegated to the userspace like ARM
> or AMD. This implies that for any pasid that the userspace wants to
> enable, it must be configured via the kernel.

Yes, configured through the kernel, but the simplified flow should
have the kernel handle everything and just emit a PASID for userspace
to use.

> just for a short summary of PASID model from previous design RFC:
>
> for arm/amd:
> - pasid space delegated to userspace
> - pasid table delegated to userspace
> - just one call to bind pasid_table() then pasids are fully managed by user
>
> for intel:
> - pasid table is always managed by kernel
> - for pdev,
> - pasid space is delegated to userspace
> - attach_ioasid(dev, ioasid, pasid) so the kernel can setup the pasid entry
> - for mdev,
> - pasid space is managed by userspace
> - attach_ioasid(dev, ioasid, vpasid). vfio converts vpasid to ppasid. iommufd setups the ppasid entry
> - additional a contract to kvm for setup CPU pasid translation if enqcmd is used
> - to unify pdev/mdev, just always call it vpasid in attach_ioasid(). let underlying driver to figure out whether vpasid should be translated.

All cases should support a kernel owned ioas associated with a
PASID. This is the universal basic API that all PASID supporting
IOMMUs need to implement.

I should not need to write generic users space that has to know how to
setup architecture specific nested userspace page tables just to use
PASID!

All of the above is qemu accelerated vIOMMU stuff. It is a good idea
to keep the two areas seperate as it greatly informs what is general
code and what is HW specific code.

Jason

2021-09-23 13:45:15

by Tian, Kevin

[permalink] [raw]

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

> From: Jason Gunthorpe <[email protected]>
> Sent: Thursday, September 23, 2021 9:31 PM
>
> On Thu, Sep 23, 2021 at 01:20:55PM +0000, Tian, Kevin wrote:
>
> > > > this is not a flow for mdev. It's also required for pdev on Intel platform,
> > > > because the pasid table is in HPA space thus must be managed by host
> > > > kernel. Even no translation we still need the user to provide the pasid
> info.
> > >
> > > There should be no mandatory vPASID stuff in most of these flows, that
> > > is just a special thing ENQCMD virtualization needs. If userspace
> > > isn't doing ENQCMD virtualization it shouldn't need to touch this
> > > stuff.
> >
> > No. for one, we also support SVA w/o using ENQCMD. For two, the key
> > is that the PASID table cannot be delegated to the userspace like ARM
> > or AMD. This implies that for any pasid that the userspace wants to
> > enable, it must be configured via the kernel.
>
> Yes, configured through the kernel, but the simplified flow should
> have the kernel handle everything and just emit a PASID for userspace
> to use.
>
>
> > just for a short summary of PASID model from previous design RFC:
> >
> > for arm/amd:
> > - pasid space delegated to userspace
> > - pasid table delegated to userspace
> > - just one call to bind pasid_table() then pasids are fully managed by
> user
> >
> > for intel:
> > - pasid table is always managed by kernel
> > - for pdev,
> > - pasid space is delegated to userspace
> > - attach_ioasid(dev, ioasid, pasid) so the kernel can setup the
> pasid entry
> > - for mdev,
> > - pasid space is managed by userspace
> > - attach_ioasid(dev, ioasid, vpasid). vfio converts vpasid to
> ppasid. iommufd setups the ppasid entry
> > - additional a contract to kvm for setup CPU pasid translation
> if enqcmd is used
> > - to unify pdev/mdev, just always call it vpasid in attach_ioasid(). let
> underlying driver to figure out whether vpasid should be translated.
>
> All cases should support a kernel owned ioas associated with a
> PASID. This is the universal basic API that all PASID supporting
> IOMMUs need to implement.
>
> I should not need to write generic users space that has to know how to
> setup architecture specific nested userspace page tables just to use
> PASID!

ah, got you! I have to admit that my previous thoughts are all from
VM p.o.v, with true userspace application ignored...

>
> All of the above is qemu accelerated vIOMMU stuff. It is a good idea
> to keep the two areas seperate as it greatly informs what is general
> code and what is HW specific code.
>

Agree. will think more along this direction. possibly this discussion
deviated a lot from what this skeleton series provide. We still have
plenty of time to figure it out when starting the pasid support. For now
at least the minimal output is that PASID might be a good candidate to
be used in iommufd. ????

Thanks
Kevin

2021-09-29 11:51:54

by Yi Liu

[permalink] [raw]

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

> From: Jean-Philippe Brucker <[email protected]>
> Sent: Wednesday, September 22, 2021 9:45 PM
>
> On Sun, Sep 19, 2021 at 02:38:39PM +0800, Liu Yi L wrote:
> > This patch adds IOASID allocation/free interface per iommufd. When
> > allocating an IOASID, userspace is expected to specify the type and
> > format information for the target I/O page table.
> >
> > This RFC supports only one type
> (IOMMU_IOASID_TYPE_KERNEL_TYPE1V2),
> > implying a kernel-managed I/O page table with vfio type1v2 mapping
> > semantics. For this type the user should specify the addr_width of
> > the I/O address space and whether the I/O page table is created in
> > an iommu enfore_snoop format. enforce_snoop must be true at this
> point,
> > as the false setting requires additional contract with KVM on handling
> > WBINVD emulation, which can be added later.
> >
> > Userspace is expected to call IOMMU_CHECK_EXTENSION (see next patch)
> > for what formats can be specified when allocating an IOASID.
> >
> > Open:
> > - Devices on PPC platform currently use a different iommu driver in vfio.
> > Per previous discussion they can also use vfio type1v2 as long as there
> > is a way to claim a specific iova range from a system-wide address space.
>
> Is this the reason for passing addr_width to IOASID_ALLOC? I didn't get
> what it's used for or why it's mandatory. But for PPC it sounds like it
> should be an address range instead of an upper limit?

yes, as this open described, it may need to be a range. But not sure
if PPC requires multiple ranges or just one range. Perhaps, David may
guide there.

Regards,
Yi Liu

> Thanks,
> Jean
>
> > This requirement doesn't sound PPC specific, as addr_width for pci
> devices
> > can be also represented by a range [0, 2^addr_width-1]. This RFC hasn't
> > adopted this design yet. We hope to have formal alignment in v1
> discussion
> > and then decide how to incorporate it in v2.
> >
> > - Currently ioasid term has already been used in the kernel
> (drivers/iommu/
> > ioasid.c) to represent the hardware I/O address space ID in the wire. It
> > covers both PCI PASID (Process Address Space ID) and ARM SSID (Sub-
> Stream
> > ID). We need find a way to resolve the naming conflict between the
> hardware
> > ID and software handle. One option is to rename the existing ioasid to be
> > pasid or ssid, given their full names still sound generic. Appreciate more
> > thoughts on this open!

2021-10-01 06:31:51

Subject: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Subject: Re: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE

Attachments:

Subject: RE: [RFC 11/20] iommu/iommufd: Add IOMMU_IOASID_ALLOC/FREE